diff --git a/evaluation_dashboard_app/.dockerignore b/evaluation_dashboard_app/.dockerignore index 08992ee..ea95081 100644 --- a/evaluation_dashboard_app/.dockerignore +++ b/evaluation_dashboard_app/.dockerignore @@ -3,6 +3,8 @@ __pycache__ .git .gitignore *.md +!Readme.md +!Readme.en.md .env .venv venv diff --git a/evaluation_dashboard_app/Dockerfile b/evaluation_dashboard_app/Dockerfile index 891c8a8..b45eb83 100644 --- a/evaluation_dashboard_app/Dockerfile +++ b/evaluation_dashboard_app/Dockerfile @@ -5,6 +5,7 @@ # Build example: docker build --secret id=ssh,src=$HOME/.ssh/id_rsa -t evaluation-dashboard . # Match ROS distro at build: --build-arg ROS_DISTRO=humble (or iron, jazzy, etc.) ARG ROS_DISTRO=humble +ARG WEBAUTOAUTHCLI_COMMIT=204629123fa58ab0be0966c795703324e34851ec FROM ros:${ROS_DISTRO} WORKDIR /app @@ -12,13 +13,29 @@ WORKDIR /app # Make ROS_DISTRO available inside the container at runtime (for entrypoint sourcing) ENV ROS_DISTRO=${ROS_DISTRO} -# Install needed system packages: python, geos, git/ssh (for pip install from private repos), pipx, OpenGL for matplotlib +# Install needed system packages: python, geos, git/ssh (for pip install from private repos), pipx, +# OpenGL for matplotlib, and Chrome for Kaleido static image export. 
RUN apt-get update \ && DEBIAN_FRONTEND=noninteractive \ apt-get install -y --no-install-recommends \ - python3-pip python3-venv libgeos-c1v5 bash git openssh-client libgl1-mesa-glx libgl1-mesa-dri \ + python3-pip python3-venv libgeos-c1v5 bash git openssh-client \ + libgl1-mesa-glx libgl1-mesa-dri ca-certificates curl gnupg \ + fontconfig fonts-noto-cjk fonts-ipafont-gothic \ + && install -d -m 0755 /etc/apt/keyrings \ + && curl -fsSL https://dl.google.com/linux/linux_signing_key.pub \ + | gpg --dearmor -o /etc/apt/keyrings/google-chrome.gpg \ + && chmod a+r /etc/apt/keyrings/google-chrome.gpg \ + && echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/google-chrome.gpg] https://dl.google.com/linux/chrome/deb/ stable main" \ + > /etc/apt/sources.list.d/google-chrome.list \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive \ + apt-get install -y --no-install-recommends google-chrome-stable \ + && ln -sf /usr/bin/google-chrome-stable /usr/bin/google-chrome \ + && fc-cache -f \ && rm -rf /var/lib/apt/lists/* +ENV CHROME_BIN=/usr/bin/google-chrome-stable + # Upgrade pip, install pipx, ensure pipx path available RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel packaging pipx \ && pipx ensurepath @@ -32,18 +49,32 @@ RUN --mount=type=secret,id=ssh,dst=/tmp/ssh_key \ && chmod 700 /root/.ssh \ && chmod 600 /root/.ssh/id_rsa \ && export GIT_SSH_COMMAND="ssh -i /root/.ssh/id_rsa -o StrictHostKeyChecking=accept-new" \ + && python3 -m pip install --no-cache-dir "git+ssh://git@github.com/tier4/WebAutoAuthCLI.git@v2.23.1" \ && python3 -m pip install --no-cache-dir "git+ssh://git@github.com/tier4/webauto-auth-py.git" \ && pipx install "git+ssh://git@github.com/tier4/v_and_v_util.git" \ && python3 -m pip install --no-cache-dir "git+ssh://git@github.com/tier4/autoware_perception_evaluation.git" \ - && python3 -m pip install --no-cache-dir "git+ssh://git@github.com/tier4/perception_catalog_analyzer.git" \ + && python3 -m pip install --no-cache-dir \ + 
"bokeh>=3.7.3" \ + "lz4>=4.4.4" \ + "pyarrow==19.0.0" \ + "tabulate>=0.9.0" \ + "typer>=0.16.0" \ + "weasyprint>=65.1" \ + && git clone --depth 1 git@github.com:tier4/perception_catalog_analyzer.git /opt/perception_catalog_analyzer \ + && python3 -m pip install --no-cache-dir --no-deps -e /opt/perception_catalog_analyzer \ && rm -rf /root/.ssh # Clean up private key ASAP for security -# Install public dependencies (after SSH deps so SSH failures surface fast) +# Install public dependencies (after SSH deps so SSH failures surface fast). +# Keep analyzer runtime deps that are not installed via `-e --no-deps` here too. COPY requirements-docker.txt . RUN python3 -m pip install --no-cache-dir -r requirements-docker.txt + # Copy application code and config COPY Overview.py . +COPY Readme.md . +COPY Readme.en.md . +COPY catalogs.json . COPY pages/ pages/ COPY lib/ lib/ COPY worker/ worker/ diff --git a/evaluation_dashboard_app/Overview.py b/evaluation_dashboard_app/Overview.py old mode 100644 new mode 100755 index 3a08af7..0843bd4 --- a/evaluation_dashboard_app/Overview.py +++ b/evaluation_dashboard_app/Overview.py @@ -1,12 +1,39 @@ import streamlit as st import pandas as pd +import io +import urllib.parse +import zipfile +import yaml from pathlib import Path from lib.run_loader import load_run -from lib.path_utils import get_data_root, get_data_root_display, list_run_directories, path_display +from lib.path_utils import ( + get_data_root, + get_data_root_display, + get_run_display_name, + list_run_directories, + path_display, + resolve_run_subdirectory, +) import plotly.express as px import plotly.graph_objects as go from lib.user_config import UserConfig -from lib.summary_compare import build_summary_delta +from lib.summary_compare import build_summary_delta, summary_delta_overlap_stats +from lib.overview_pdf_report import build_overview_pdf_report, make_report_filename +from lib.specsheet_report import ( + DEFAULT_SPECSHEET_LABELS, + DEFAULT_SPECSHEET_PROJECT_ID, + 
DEFAULT_SPECSHEET_TOPIC, + DEFAULT_TREND_METADATA_TEXT, + collect_candidate_specsheet_labels, + generate_specsheet_pdf, + get_release_specsheet_context, + get_specsheet_artifact_paths, + is_specsheet_pdf_fresh, + parse_trend_metadata_text, + progress_fraction_from_message, + resolve_specsheet_generation_run_path, + write_trend_metadata, +) from lib.page_chrome import ( inject_app_page_styles, render_loaded_data_section, @@ -30,12 +57,12 @@ # ====== CONFIG AND CONSTANTS ====== st.set_page_config(page_title="Overview", layout="wide", initial_sidebar_state="expanded") inject_app_page_styles() -if running_in_docker(): - st.sidebar.page_link( - "pages/99_Deployment_Debug.py", - label="Deployment debug", - icon="🐳", - ) +# if running_in_docker(): +# st.sidebar.page_link( +# "pages/99_Deployment_Debug.py", +# label="Deployment debug", +# icon="🐳", +# ) RUN_ROOT = get_data_root() PRODUCT_LABEL_JA = { "Occlusion-Case": "遮蔽ケース", @@ -262,7 +289,19 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No # List run directories (subdirectories in RUN_ROOT) run_dirs = list_run_directories() -run_names = [p.name for p in run_dirs] +run_names = [get_run_display_name(p) for p in run_dirs] + + +def _coerce_run_param_to_display_name(value: str | None) -> str: + raw = str(value or "").strip() + if not raw: + return "" + if raw in run_names: + return raw + resolved, err = resolve_run_subdirectory(raw) + if err or resolved is None: + return "" + return get_run_display_name(resolved) if not run_dirs: st.warning(f"No runs found in '{get_data_root_display()}'.\n\nPlease add at least one sub-directory with evaluation results, e.g. 
`{get_data_root_display()}/my_eval_run/`.") @@ -280,12 +319,14 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No saved_run_a = user_config.get("overview_run_a", run_names[0] if run_names else "") # URL override (only if valid) -if url_run_a in run_names: - saved_run_a = url_run_a +url_run_a_display = _coerce_run_param_to_display_name(url_run_a) +if url_run_a_display in run_names: + saved_run_a = url_run_a_display run_a_index = run_names.index(saved_run_a) if saved_run_a in run_names else 0 -run_a_dir = st.sidebar.selectbox("Baseline (A)", run_dirs, index=run_a_index, format_func=lambda p: p.name) -user_config.set("overview_run_a", run_a_dir.name) +run_a_dir = st.sidebar.selectbox("Baseline (A)", run_dirs, index=run_a_index, format_func=get_run_display_name) +run_a_name = get_run_display_name(run_a_dir) +user_config.set("overview_run_a", run_a_name) compare_run_names = [] # list of run names for candidates B, C, D, ... if mode == "Compare Mode": @@ -298,7 +339,11 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No if not saved_compare and run_names: saved_compare = [run_names[1]] if len(run_names) > 1 else [run_names[0]] if url_compare_runs: - valid_url = [r for r in url_compare_runs if r in run_names] + valid_url = [ + display + for display in (_coerce_run_param_to_display_name(r) for r in url_compare_runs) + if display in run_names + ] if valid_url: saved_compare = valid_url st.session_state["overview_compare_run_names"] = list(saved_compare) @@ -315,10 +360,10 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No f"Candidate ({letter})", run_dirs, index=idx, - format_func=lambda p: p.name, + format_func=get_run_display_name, key=f"compare_run_select_{i}", ) - new_compare_run_names.append(selected.name) + new_compare_run_names.append(get_run_display_name(selected)) with col_rm: if len(compare_run_names) > 1: if st.button("✕", key=f"compare_remove_{i}", help="Remove this 
run"): @@ -332,7 +377,7 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No st.session_state["overview_compare_run_names"] = compare_run_names if st.sidebar.button("➕ Add run", help="Add another run to compare"): - used = {run_a_dir.name} | set(compare_run_names) + used = {run_a_name} | set(compare_run_names) next_name = next((n for n in run_names if n not in used), run_names[0]) new_list = compare_run_names + [next_name] st.session_state["overview_compare_run_names"] = new_list @@ -345,13 +390,13 @@ def show_grouped_metrics_plot_multi(df_list, run_labels, group_col, label_map=No compare_run_dirs = [] if mode == "Compare Mode" and compare_run_names: - name_to_dir = {p.name: p for p in run_dirs} + name_to_dir = {get_run_display_name(p): p for p in run_dirs} compare_run_dirs = [name_to_dir[n] for n in compare_run_names if n in name_to_dir] # ====== SYNC URL (NON-DESTRUCTIVE) ====== query = { "mode": "compare" if mode == "Compare Mode" else "single", - "run_a": run_a_dir.name, + "run_a": run_a_name, } for j, name in enumerate(compare_run_names): query[f"run_{chr(98 + j)}"] = name # run_b, run_c, ... 
@@ -400,17 +445,93 @@ def safe_load_run(path, label='Run'): st.session_state.pop(key, None) # ====== MAIN PAGE METRICS & CHARTS ====== -_ov_entries = [("Baseline · A", path_display(runA["path"]))] +_ov_entries = [("Baseline · A", get_run_display_name(runA["path"]))] if mode == "Compare Mode" and compare_run_dirs: all_runs = st.session_state["all_runs"] run_labels = st.session_state["run_labels"] for i in range(1, len(all_runs)): - _ov_entries.append((f"Candidate · {run_labels[i]}", path_display(all_runs[i]["path"]))) + _ov_entries.append((f"Candidate · {run_labels[i]}", get_run_display_name(all_runs[i]["path"]))) render_loaded_data_section(_ov_entries) -share_q = f"mode={'compare' if mode == 'Compare Mode' else 'single'}&run_a={run_a_dir.name}" + +if mode == "Compare Mode" and compare_run_dirs: + _all_r = st.session_state.get("all_runs") + _lbls = st.session_state.get("run_labels") + if _all_r and _lbls and all(r.get("summary") is not None for r in _all_r): + _cand_stats: list[tuple[str, dict]] = [] + _overlap_rows: list[dict] = [] + _empty_labels: list[str] = [] + _invalid_msgs: list[str] = [] + for i in range(1, len(_all_r)): + cand = _lbls[i] + stt = summary_delta_overlap_stats(_all_r[0]["summary"], _all_r[i]["summary"]) + _cand_stats.append((cand, stt)) + if not stt.get("valid"): + _invalid_msgs.append(f"**{cand}:** {stt.get('error', 'Unknown error')}") + continue + join_s = " + ".join(stt["key_cols"]) + _overlap_rows.append( + { + "Candidate": cand, + "Join keys": join_s, + "Baseline rows": stt["n_rows_baseline"], + "Candidate rows": stt["n_rows_candidate"], + "Matched (Δ rows)": stt["n_matched_keys"], + "Keys only in A": stt["n_only_baseline"], + "Keys only in candidate": stt["n_only_candidate"], + } + ) + if stt["matched_empty"]: + _empty_labels.append(cand) + if _invalid_msgs: + st.warning( + "Cannot compute Summary delta alignment for some runs:\n\n" + + "\n\n".join(_invalid_msgs) + ) + if _empty_labels: + _join_cols = next( + (" + ".join(f"`{c}`" for c in 
s["key_cols"]) for cnd, s in _cand_stats if cnd in _empty_labels and s.get("valid")), + "`id` (or `id` + `perception_label` when both have it)", + ) + st.warning( + "**TP Summary delta views will be empty** for candidate(s) " + f"**{', '.join(_empty_labels)}**: baseline **A** and those runs share **no** overlapping " + f"Summary join keys ({_join_cols}). " + "The inner join drops every row; use **Baseline** or **Candidate** in the TP Summary sidebar, " + "or choose runs whose Summary rows use the same keys. " + "Open **Summary key overlap (delta alignment)** below for row counts and sample keys " + "that appear on only one side." + ) + with st.expander("Summary key overlap (delta alignment) — details", expanded=False): + st.markdown( + "Delta tables on **TP Summary** inner-join baseline **A** to each candidate on the " + "same keys as here: **`id`**, or **`id` + `perception_label`** when both summaries " + "include `perception_label`. Only **matched** keys produce rows; the rest are ignored." + ) + st.dataframe(pd.DataFrame(_overlap_rows), width="stretch", hide_index=True) + for cand, stt in _cand_stats: + if not stt.get("valid"): + continue + sb = stt["sample_only_baseline"] + sc = stt["sample_only_candidate"] + if not sb and not sc: + continue + st.markdown(f"**Examples — candidate {cand}**") + c1, c2 = st.columns(2) + with c1: + st.caption("Up to 5 keys only in baseline A") + st.code("\n".join(sb) if sb else "(none)") + with c2: + st.caption(f"Up to 5 keys only in {cand}") + st.code("\n".join(sc) if sc else "(none)") + +share_query = { + "mode": "compare" if mode == "Compare Mode" else "single", + "run_a": run_a_name, +} if mode == "Compare Mode" and compare_run_names: for j, name in enumerate(compare_run_names): - share_q += f"&run_{chr(98 + j)}={name}" + share_query[f"run_{chr(98 + j)}"] = name +share_q = urllib.parse.urlencode(share_query) render_share_link_callout( share_q, caption="Append to your server URL (e.g. `https://host:8501/?` + query). 
Build links from Data Management too.", @@ -542,3 +663,514 @@ def show_tp_mean_by_label_compare(df_list, run_labels, label_col, label_jp_map=N with st.expander("Show metric breakdowns by label", expanded=False): show_grouped_metrics_plot(df_summary, group_col="perception_label", mode="single") show_grouped_metrics_plot(df_summary, group_col="product_label", label_map=PRODUCT_LABEL_JA, mode="single") + + +st.divider() +section_header("Export Dashboard Report", "Generate a curated PDF from the current Overview selection and filters.") +_report_runs = st.session_state.get("all_runs") if mode == "Compare Mode" and compare_run_dirs else [runA] +_report_labels = st.session_state.get("run_labels") if mode == "Compare Mode" and compare_run_dirs else ["A"] +_report_filters = { + "perception_labels": filters.get("perception_labels", []), + "product_labels": filters.get("product_labels", []), +} +_report_key = { + "mode": mode, + "paths": [str(r.get("path")) for r in _report_runs], + "perception_labels": list(_report_filters["perception_labels"]), + "product_labels": list(_report_filters["product_labels"]), +} +pdf_col1, pdf_col2 = st.columns([1.2, 2.8]) +with pdf_col1: + if st.button("Generate Evaluation Dashboard Report", type="primary", use_container_width=True): + _pdf_status = st.empty() + try: + def _update_pdf_status(message: str) -> None: + _pdf_status.info(f"Generating report: {message}") + + _update_pdf_status("starting") + pdf_bytes = build_overview_pdf_report( + mode=mode, + run_records=_report_runs, + run_labels=_report_labels, + filters=_report_filters, + product_label_map=PRODUCT_LABEL_JA, + progress_callback=_update_pdf_status, + ) + st.session_state["overview_pdf_report_bytes"] = pdf_bytes + st.session_state["overview_pdf_report_key"] = _report_key + run_names_for_file = [get_run_display_name(r["path"]) for r in _report_runs if r.get("path") is not None] + st.session_state["overview_pdf_report_name"] = make_report_filename(run_names_for_file) + 
_pdf_status.success("PDF report is ready.") + except Exception as e: + st.session_state.pop("overview_pdf_report_bytes", None) + st.session_state.pop("overview_pdf_report_key", None) + st.session_state.pop("overview_pdf_report_name", None) + _pdf_status.error(f"PDF generation failed: {e}") +with pdf_col2: + _pdf_ready = ( + st.session_state.get("overview_pdf_report_bytes") is not None + and st.session_state.get("overview_pdf_report_key") == _report_key + ) + if _pdf_ready: + st.download_button( + "Download Evaluation Dashboard Report", + data=st.session_state["overview_pdf_report_bytes"], + file_name=st.session_state.get("overview_pdf_report_name", "overview_report.pdf"), + mime="application/pdf", + use_container_width=True, + ) + +specsheet_title = "Export Specsheet Report" +section_header( + specsheet_title, + "Generate the release-oriented spec-sheet PDF.", +) + +_specsheet_run_records = _report_runs +_specsheet_run_labels = _report_labels +_specsheet_run_options = {} +for label, record in zip(_specsheet_run_labels, _specsheet_run_records): + source_path = record["path"] + target_path = resolve_specsheet_generation_run_path(source_path) + release_context = get_release_specsheet_context(source_path) + option_label = f"{label} · {get_run_display_name(source_path)}" + if release_context is not None and target_path != source_path: + option_label = ( + f"{label} · {get_run_display_name(source_path)} " + f"(PDF body: {get_run_display_name(target_path)})" + ) + _specsheet_run_options[option_label] = { + "source_path": source_path, + "target_path": target_path, + "release_context": release_context, + } +_specsheet_run_option_keys = list(_specsheet_run_options.keys()) +_default_specsheet_run_selection = _specsheet_run_option_keys[:1] +_default_specsheet_labels = list(DEFAULT_SPECSHEET_LABELS) +_default_specsheet_project_id = st.session_state.get("specsheet_project_id", DEFAULT_SPECSHEET_PROJECT_ID) +_default_specsheet_topic = st.session_state.get("specsheet_topic_name", 
DEFAULT_SPECSHEET_TOPIC) +_single_specsheet_run_path = resolve_specsheet_generation_run_path(_specsheet_run_records[0]["path"]) +_default_specsheet_version = get_run_display_name(_single_specsheet_run_path) + +if mode == "Compare Mode": + selected_specsheet_run_keys = st.multiselect( + "Runs to generate spec-sheet for", + options=list(_specsheet_run_options.keys()), + default=_default_specsheet_run_selection, + key="specsheet_target_runs", + help="Spec-sheet generation is single-run, so multiple selected runs are processed one by one.", + ) +else: + selected_specsheet_run_keys = _specsheet_run_option_keys[:1] + +_selected_specsheet_entries = [ + _specsheet_run_options[key] + for key in selected_specsheet_run_keys + if key in _specsheet_run_options +] +selected_specsheet_run_paths = [] +_seen_specsheet_targets = set() +for entry in _selected_specsheet_entries: + target_path = entry["target_path"] + target_key = str(target_path.resolve()) + if target_key in _seen_specsheet_targets: + continue + selected_specsheet_run_paths.append(target_path) + _seen_specsheet_targets.add(target_key) +selected_specsheet_release_contexts = [] +_seen_specsheet_releases = set() +for entry in _selected_specsheet_entries: + release_context = entry["release_context"] + if release_context is None: + continue + release_dir = release_context.get("release_dir") + release_key = str(release_dir.resolve()) if isinstance(release_dir, Path) else str(release_dir) + if release_key in _seen_specsheet_releases: + continue + selected_specsheet_release_contexts.append(release_context) + _seen_specsheet_releases.add(release_key) +_active_specsheet_paths = [get_specsheet_artifact_paths(path) for path in selected_specsheet_run_paths] +_selected_trend_metadata_text = "" +_selected_trend_metadata_path = None +if len(selected_specsheet_release_contexts) == 1: + candidate_path = selected_specsheet_release_contexts[0].get("metadata") + if isinstance(candidate_path, Path) and candidate_path.exists(): + 
_selected_trend_metadata_path = candidate_path +if _selected_trend_metadata_path is None and len(_active_specsheet_paths) == 1 and _active_specsheet_paths[0]["trend_metadata"].exists(): + _selected_trend_metadata_path = _active_specsheet_paths[0]["trend_metadata"] +if _selected_trend_metadata_path is not None: + try: + _selected_trend_metadata_text = _selected_trend_metadata_path.read_text(encoding="utf-8") + except Exception: + _selected_trend_metadata_text = "" + +_selected_metadata_defaults = {} +if _selected_trend_metadata_text: + try: + _selected_metadata_defaults = parse_trend_metadata_text(_selected_trend_metadata_text) + except Exception: + _selected_metadata_defaults = {} + +def _specsheet_title_version_from_metadata(metadata: dict) -> str: + explicit = str(metadata.get("version_abbr") or "").strip() + if explicit: + return explicit + version = str(metadata.get("pilot_auto_version") or "").strip() + if version.lower().startswith("pilot.auto "): + return version[len("Pilot.Auto "):].strip() + return version + +_metadata_default_version = _specsheet_title_version_from_metadata(_selected_metadata_defaults) +if _metadata_default_version: + _default_specsheet_version = _metadata_default_version +_metadata_trend_topic = str(_selected_metadata_defaults.get("topic_name") or "").strip() +if ( + _metadata_trend_topic + and _metadata_trend_topic != DEFAULT_SPECSHEET_TOPIC + and st.session_state.get("specsheet_topic_name") == _metadata_trend_topic +): + st.session_state["specsheet_topic_name"] = DEFAULT_SPECSHEET_TOPIC + _default_specsheet_topic = DEFAULT_SPECSHEET_TOPIC + +_specsheet_defaults_source = str(_selected_trend_metadata_path or _single_specsheet_run_path) +_previous_auto_version = st.session_state.get("specsheet_version_auto_value") +_current_version = st.session_state.get("specsheet_version") +if ( + st.session_state.get("specsheet_version_auto_source") != _specsheet_defaults_source + and ( + "specsheet_version" not in st.session_state + or 
_current_version == _previous_auto_version + or str(_current_version or "").endswith(("/performance", "/devops")) + ) +): + st.session_state["specsheet_version"] = _default_specsheet_version +st.session_state["specsheet_version_auto_source"] = _specsheet_defaults_source +st.session_state["specsheet_version_auto_value"] = _default_specsheet_version + +_previous_auto_topic = st.session_state.get("specsheet_topic_auto_value") +_current_topic = st.session_state.get("specsheet_topic_name") +if ( + st.session_state.get("specsheet_topic_auto_source") != _specsheet_defaults_source + and ( + "specsheet_topic_name" not in st.session_state + or _current_topic == _previous_auto_topic + ) +): + st.session_state["specsheet_topic_name"] = _default_specsheet_topic +st.session_state["specsheet_topic_auto_source"] = _specsheet_defaults_source +st.session_state["specsheet_topic_auto_value"] = _default_specsheet_topic + +specsheet_cfg_col1, specsheet_cfg_col2, specsheet_cfg_col3 = st.columns([1.4, 1.2, 1.4]) +with specsheet_cfg_col1: + specsheet_project_id = st.text_input( + "Project ID", + value=_default_specsheet_project_id, + key="specsheet_project_id", + ).strip() +with specsheet_cfg_col2: + specsheet_version = st.text_input( + "Version", + value=_default_specsheet_version, + key="specsheet_version", + ).strip() +with specsheet_cfg_col3: + specsheet_topic_name = st.text_input( + "Topic name", + value=_default_specsheet_topic, + key="specsheet_topic_name", + ).strip() + +_detected_specsheet_labels = [] +for run_path in selected_specsheet_run_paths: + _detected_specsheet_labels.extend(collect_candidate_specsheet_labels(run_path)) +specsheet_labels = list(dict.fromkeys(_detected_specsheet_labels or _default_specsheet_labels)) +if specsheet_labels: + st.caption(f"Labels: all detected labels ({len(specsheet_labels)})") +if not selected_specsheet_run_paths: + st.info("Pick at least one run to build the release spec-sheet.") + +if _selected_trend_metadata_text and 
"specsheet_include_trend" not in st.session_state: + st.session_state["specsheet_include_trend"] = True + +_release_trend_status_text = "" +if selected_specsheet_release_contexts: + for release_context in selected_specsheet_release_contexts[:1]: + release_dir = release_context.get("release_dir") + roles = release_context.get("roles", {}) + role_status = [] + if isinstance(roles, dict): + for role_name in ("performance", "devops"): + role_info = roles.get(role_name) + if not isinstance(role_info, dict): + continue + bits = [] + bits.append("summary.json" if role_info.get("has_summary") else "no summary.json") + bits.append("metadata.yaml" if role_info.get("has_metadata") else "no metadata.yaml") + role_status.append(f"{role_name}: {', '.join(bits)}") + release_text = f"Release folder: `{path_display(release_dir)}`." if isinstance(release_dir, Path) else "Release folder detected." + if role_status: + release_text += " " + "; ".join(role_status) + "." + _release_trend_status_text = release_text + +trend_toggle_col, trend_status_col = st.columns([1.1, 2.9]) +with trend_toggle_col: + specsheet_trend_enabled = st.toggle( + "Include trend data", + value=bool(st.session_state.get("specsheet_include_trend", bool(_selected_trend_metadata_text))), + key="specsheet_include_trend", + help="Save release metadata and include available trend history.", + ) +with trend_status_col: + if specsheet_trend_enabled and _selected_trend_metadata_path is not None and _selected_trend_metadata_text: + st.caption(f"Using saved metadata: `{path_display(_selected_trend_metadata_path)}`") + elif specsheet_trend_enabled: + st.caption("No saved metadata found. 
Fill in release metadata below.") + if specsheet_trend_enabled and _release_trend_status_text: + st.caption(_release_trend_status_text) + +trend_metadata_payload = None +trend_metadata_changed = False +trend_metadata_change_confirmed = False +if specsheet_trend_enabled: + _trend_metadata_source_key = str(_selected_trend_metadata_path) if _selected_trend_metadata_path is not None else "__default__" + if ( + st.session_state.get("specsheet_trend_metadata_source") != _trend_metadata_source_key + or "specsheet_trend_metadata_text" not in st.session_state + ): + st.session_state["specsheet_trend_metadata_text"] = _selected_trend_metadata_text or DEFAULT_TREND_METADATA_TEXT + st.session_state["specsheet_trend_metadata_source"] = _trend_metadata_source_key + st.session_state["specsheet_confirm_metadata_changes"] = False + trend_metadata_text = st.text_area( + "Trend metadata YAML", + key="specsheet_trend_metadata_text", + height=180, + help="Required keys: tags, pilot_auto_version, data_count, description, date.", + ) + trend_metadata_changed = bool(_selected_trend_metadata_text) and ( + trend_metadata_text.strip() != _selected_trend_metadata_text.strip() + ) + if trend_metadata_changed: + st.warning("Saved metadata was edited. 
Confirm before generating.") + trend_metadata_change_confirmed = st.checkbox( + "Confirm saved metadata changes", + key="specsheet_confirm_metadata_changes", + ) + trend_metadata_status = st.empty() + try: + trend_metadata_payload = parse_trend_metadata_text(trend_metadata_text) + trend_metadata_status.success("Trend metadata looks valid.") + except Exception as trend_exc: + trend_metadata_status.error(f"Trend metadata error: {trend_exc}") + +_specsheet_key = { + "run_paths": [str(path) for path in selected_specsheet_run_paths], + "project_id": specsheet_project_id, + "version": specsheet_version, + "topic_name": specsheet_topic_name, + "labels": list(specsheet_labels), + "include_trend": specsheet_trend_enabled, + "trend_metadata": trend_metadata_payload if specsheet_trend_enabled else None, + "artifact_kind": "zip" if len(selected_specsheet_run_paths) > 1 else "pdf", +} +_specsheet_ready = ( + st.session_state.get("specsheet_pdf_report_bytes") is not None + and st.session_state.get("specsheet_pdf_report_key") == _specsheet_key +) + +def _release_specsheet_pdf_path(release_context: dict, topic_name: str) -> Path | None: + release_dir = release_context.get("release_dir") + if not isinstance(release_dir, Path): + return None + specsheet_root = release_dir / "specsheet" + topic = str(topic_name or "").strip() + candidates = [] + if topic: + candidates.append(specsheet_root / topic / "specsheet.pdf") + candidates.append(specsheet_root / "specsheet.pdf") + candidates.extend(sorted(specsheet_root.glob("*/*.pdf"))) + for candidate in candidates: + if candidate.exists() and not candidate.is_dir(): + return candidate + return None + +_release_specsheet_paths = [ + pdf_path + for pdf_path in ( + _release_specsheet_pdf_path(release_context, specsheet_topic_name) + for release_context in selected_specsheet_release_contexts + ) + if pdf_path is not None +] +_generated_specsheet_paths = [ + path_info["specsheet_pdf"] + for path_info in _active_specsheet_paths + if 
path_info["specsheet_pdf"].exists() and is_specsheet_pdf_fresh(path_info["run_dir"]) +] +_existing_specsheet_paths = _release_specsheet_paths or _generated_specsheet_paths +_all_selected_specsheet_pdfs_ready = ( + len(selected_specsheet_run_paths) > 0 + and len(_existing_specsheet_paths) == len(selected_specsheet_run_paths) +) +_specsheet_has_existing_pdf = _specsheet_ready or _all_selected_specsheet_pdfs_ready +_specsheet_action_label = ( + "Regenerate Release Spec-sheet PDF" + if _specsheet_has_existing_pdf + else "Generate Release Spec-sheet PDF" +) + +specsheet_action_col1, specsheet_action_col2 = st.columns([1.2, 2.8]) +with specsheet_action_col1: + if st.button( + _specsheet_action_label, + type="secondary" if _specsheet_has_existing_pdf else "primary", + use_container_width=True, + ): + _specsheet_status = st.empty() + _specsheet_progress = st.progress(0.0) + try: + if not specsheet_project_id: + raise ValueError("Project ID is required.") + if not specsheet_version: + raise ValueError("Version is required.") + if not specsheet_topic_name: + raise ValueError("Topic name is required.") + if not selected_specsheet_run_paths: + raise ValueError("At least one run must be selected.") + if specsheet_trend_enabled and len(selected_specsheet_run_paths) != 1: + raise ValueError("Trend-enabled release spec-sheet generation currently supports exactly one run.") + if specsheet_trend_enabled and trend_metadata_payload is None: + raise ValueError("Valid trend metadata is required when trend mode is enabled.") + if specsheet_trend_enabled and trend_metadata_changed and not trend_metadata_change_confirmed: + raise ValueError("Confirm the metadata.yaml changes before generating.") + + stage_progress = { + "Using existing up-to-date spec-sheet PDF": 1.0, + "Loading CSV files": 0.15, + "Building abstract and detail sections": 0.2, + "Validating full trend summary": 0.9, + "Saving trend metadata": 0.9, + "Collecting trend history": 0.92, + "Rendering trend plots": 0.94, + 
"Rendering PDF": 0.95, + "Spec-sheet PDF is ready": 1.0, + } + + def _update_specsheet_status(message: str) -> None: + fraction = None + label_fraction = progress_fraction_from_message(message) + if "[Full] Generating blocks for labels" in message and label_fraction is not None: + fraction = 0.2 + (0.7 - 0.2) * label_fraction + elif ( + "[Full] Generating annotation count blocks for labels" in message + and label_fraction is not None + ): + fraction = 0.7 + (0.9 - 0.7) * label_fraction + elif label_fraction is not None and "Processing pkl files" in message: + fraction = 0.02 + (0.12 - 0.02) * label_fraction + else: + fraction = stage_progress.get(message, 0.05) + _specsheet_progress.progress(fraction) + _specsheet_status.info(f"Generating release spec-sheet: {message}") + + generated_pdfs: list[tuple[Path, bool]] = [] + for idx, run_path in enumerate(selected_specsheet_run_paths, start=1): + _update_specsheet_status(f"Run {idx}/{len(selected_specsheet_run_paths)}: {get_run_display_name(run_path)}") + if specsheet_trend_enabled and trend_metadata_payload is not None: + if _selected_trend_metadata_path is not None and trend_metadata_changed: + _selected_trend_metadata_path.write_text( + yaml.safe_dump(trend_metadata_payload, allow_unicode=True, sort_keys=False), + encoding="utf-8", + ) + if len(selected_specsheet_release_contexts) == 1: + roles = selected_specsheet_release_contexts[0].get("roles", {}) + if isinstance(roles, dict): + for role_info in roles.values(): + if not isinstance(role_info, dict) or not role_info.get("has_summary"): + continue + role_run_dir = role_info.get("run_dir") + if isinstance(role_run_dir, Path): + write_trend_metadata(role_run_dir, trend_metadata_payload) + pdf_path, generated = generate_specsheet_pdf( + run_path, + project_id=specsheet_project_id, + version=specsheet_version, + labels=specsheet_labels, + topic_name=specsheet_topic_name, + include_trend=specsheet_trend_enabled, + trend_metadata=trend_metadata_payload, + force=True, + 
progress_callback=_update_specsheet_status, + ) + generated_pdfs.append((pdf_path, generated)) + + if len(generated_pdfs) == 1: + download_name = generated_pdfs[0][0].name + download_bytes = generated_pdfs[0][0].read_bytes() + download_mime = "application/pdf" + else: + zip_buffer = io.BytesIO() + with zipfile.ZipFile(zip_buffer, "w", compression=zipfile.ZIP_DEFLATED) as zf: + for pdf_path, _ in generated_pdfs: + zf.write(pdf_path, arcname=f"{pdf_path.parent.parent.name}/{pdf_path.name}") + download_name = "specsheet_reports.zip" + download_bytes = zip_buffer.getvalue() + download_mime = "application/zip" + + st.session_state["specsheet_pdf_report_bytes"] = download_bytes + st.session_state["specsheet_pdf_report_key"] = _specsheet_key + st.session_state["specsheet_pdf_report_name"] = download_name + st.session_state["specsheet_pdf_report_mime"] = download_mime + _specsheet_ready = True + _specsheet_progress.progress(1.0) + if any(generated for _, generated in generated_pdfs): + if len(generated_pdfs) == 1: + _specsheet_status.success("Release spec-sheet PDF is ready.") + else: + _specsheet_status.success("Release spec-sheet files are ready.") + else: + if len(generated_pdfs) == 1: + _specsheet_status.success("Using the existing up-to-date release spec-sheet PDF.") + else: + _specsheet_status.success("Using the existing up-to-date release spec-sheet files.") + except Exception as e: + st.session_state.pop("specsheet_pdf_report_bytes", None) + st.session_state.pop("specsheet_pdf_report_key", None) + st.session_state.pop("specsheet_pdf_report_name", None) + st.session_state.pop("specsheet_pdf_report_mime", None) + _specsheet_status.error(f"Spec-sheet generation failed: {e}") +with specsheet_action_col2: + if _specsheet_ready: + st.success("Release spec-sheet is ready.") + st.download_button( + "Download Release Spec-sheet", + data=st.session_state["specsheet_pdf_report_bytes"], + file_name=st.session_state.get("specsheet_pdf_report_name", "specsheet.pdf"), + 
mime=st.session_state.get("specsheet_pdf_report_mime", "application/pdf"), + use_container_width=True, + ) + elif _all_selected_specsheet_pdfs_ready: + st.success("Existing release spec-sheet is ready.") + if len(_existing_specsheet_paths) == 1: + _disk_pdf_path = _existing_specsheet_paths[0] + st.download_button( + "Download Release Spec-sheet", + data=_disk_pdf_path.read_bytes(), + file_name=_disk_pdf_path.name, + mime="application/pdf", + use_container_width=True, + ) + else: + _zip_buffer = io.BytesIO() + with zipfile.ZipFile(_zip_buffer, "w", compression=zipfile.ZIP_DEFLATED) as zf: + for pdf_path in _existing_specsheet_paths: + zf.write(pdf_path, arcname=f"{pdf_path.parent.parent.name}/{pdf_path.name}") + st.download_button( + "Download Release Spec-sheets", + data=_zip_buffer.getvalue(), + file_name="specsheet_reports.zip", + mime="application/zip", + use_container_width=True, + ) + else: + if len(selected_specsheet_run_paths) == 1: + _single_paths = _active_specsheet_paths[0] diff --git a/evaluation_dashboard_app/Readme.en.md b/evaluation_dashboard_app/Readme.en.md new file mode 100755 index 0000000..5d94cf1 --- /dev/null +++ b/evaluation_dashboard_app/Readme.en.md @@ -0,0 +1,372 @@ +# Evaluation Dashboard + +## Required Installation + +This dashboard and evaluation tool require the following prerequisites and Python packages. + +### Python packages (local development / full functionality) +The easiest way is to install from the single `requirements.txt` at the repository root, including private dependencies. 
+ +```sh +cd evaluation_dashboard_app +pip install -r requirements.txt +``` + +Example if you want to install packages manually in separate steps: + +```sh +# Basic +pip install \ + streamlit pandas plotly duckdb numpy \ + requests pyyaml matplotlib shapely + +# Download / Scenario API authentication +pip install git+ssh://git@github.com/tier4/webauto-auth-py.git + +# Production task queue (when USE_TASK_QUEUE=true) +pip install rq psycopg2-binary +``` + +In the **Docker image**, public dependencies are installed from [`requirements-docker.txt`](requirements-docker.txt), and private packages such as `webauto-auth` and the evaluation dependencies are added during build time using SSH secrets (see [`Dockerfile`](Dockerfile)). + +PDF export uses Plotly/Kaleido static image rendering, so **Chrome is also installed in the Docker image**. If you see `Kaleido requires Google Chrome to be installed` in the deployment environment, **rebuild and redeploy** with the latest image. + +```sh +# Install CLI tool (if you use it for generating evaluation command lines) +pipx install git+ssh://git@github.com/tier4/v_and_v_util.git +``` + +### pilot-auto / perception_eval (only needed when generating Summary / Score) +- A pilot-auto environment with `perception_eval` available is required. See "Usage" below. +- If importing `perception_eval` fails, generation of `Summary.csv` / `Score.csv` stops. + +### Configuration file +- Input values are saved in `configs/autoware_evaluator_dl_config.json` (created / updated automatically). + +## Overview +This is an evaluation dashboard built with Streamlit. It reads evaluation results under `data/` (`Summary.csv`, `Score.csv`, `.parquet`) and visualizes them across multiple pages. In addition, `pages/6_Download.py` supports bulk collection of evaluation results such as `result.txt`, automatic generation of `Summary.csv` / `Score.csv`, and searching / downloading result directories. 
The **TLR (Traffic Light Recognition) Analysis** page can visualize criteria matrices, vehicle state vs. signal type, important zones, and more for traffic-light recognition evaluation. To use it, you must first download scenario data from **tab 2 "Download Scenarios"** on the Download page. + +## Usage + +1. To generate summary or score files from `pages/6_Download.py` ("Generate Summary.csv / Score.csv"), you must **activate the pilot-auto (ROS 2) environment in advance** with the following command: + ``` + source path_to_pilot/install/setup.sh + ``` + This step is required for "Summary / Score CSV generation" in `pages/6_Download.py`. + +2. Start Streamlit from `evaluation_dashboard_app/`. + ``` + streamlit run Overview.py + ``` + +3. Choose pages and filters from the sidebar to explore the data. + +### Visualization quick start (recommended workflow) + +The recommended flow from downloading logs for a test to generating summaries and then reviewing the details in Overview is the following three-step process: + +1. **Download the target test logs from the Download page** +2. **Generate summary / score files from "Eval Results" on the Download page** +3. **Select that log (Run) on the Overview page and inspect the details** + +Below is a summary of what to do and what to watch out for in each step. + +#### Step 1: Download logs from the Download page + +- **Page**: Open **Download** (`6_Download.py`) from the sidebar. +- **Tab**: Select **"Download Results"**. +- **Inputs**: + - Enter **Project ID** and **Job ID**. Optionally specify a Suite ID if needed. + - For **Output Path**, specify **a folder dedicated to this test**. + To make it show up as a selectable "Run" in Overview, it is recommended to place one folder per test directly under `data/`. + Example: `./data/my_test_20250203` +- **Download Type**: + - **Archives (ZIP)**: Downloads ZIP archives, extracts them, and takes data for the selected phase. Suitable for full local analysis. 
+ - **Result JSON only**: Downloads only the result JSON. Lightweight and useful when you only want summary / score generation. +- **Run**: Click "Download Results" and wait for completion. +- **Result**: Under the specified Output Path, logs and, when needed, source files such as `result.txt` and `score.json` are stored in a directory structure based on the job / suite. + +![Download page settings (Download Results tab)](docs/images/download_config.png) + +![After download finishes](docs/images/download_result.png) + +#### Step 2: Generate summary analysis results in Eval Results + +- **Page**: Stay on the same **Download** page. +- **Tab**: Switch to **"Eval Results (per directory)"** or **"Eval Results"**. +- **Root directory to evaluate**: + - Specify **the same path used as Output Path in Step 1**. + Example: `./data/my_test_20250203` +- **Options**: + - **Search subdirectories**: Searches subdirectories for `result.txt` / `score.json`. Usually this should be enabled. + - **Only generate Summary.csv and Score.csv**: + If each directory already contains `result.txt` or `score.json`, enabling this skips re-running `perception_eval` and generates **only `Summary.csv` and `Score.csv`** from the existing results. + On the first run, if `result.txt` and related outputs do not exist yet, leave this unchecked and run the full evaluation with "Run eval_result for all directories". +- **Run**: + - Click either "Run eval_result for all directories" or "Generate Summary and Score CSV only". +- **Result**: **`Summary.csv` and `Score.csv`** are generated directly under the specified root directory. + These files are the "summary analysis results" used by Overview and pages such as TP Summary and Criteria Based Score. + +![Eval Results tab (summary / score generation)](docs/images/eval_result.png) + +If `perception_eval` is used during Summary / Score generation, you must run `source path_to_pilot/install/setup.sh` in advance as described in "Usage". 
+ +#### Step 3: Select the log in Overview and inspect the details + +- **Page**: Open **Overview** (`Overview.py`) from the sidebar. +- **Selecting a Run**: + - Overview treats **each direct subdirectory under `data/`** as one "Run". + - If the Output Path in Step 1 was `./data/<test_name>`, that `<test_name>` appears in the sidebar dropdown for **"Baseline (A)"**. + - Choose the log (Run) you want to inspect in **Baseline (A)**. + If you want to compare runs, switch to **Compare Mode** and choose another Run in **Candidate (B)**. +- **Displayed contents**: + - Overall metrics based on the selected Run's **Summary.csv** are shown, such as TP mean and XRMS / YRMS / XSTD / YSTD. + - By filtering with Perception Label / Product Label, you can inspect label-specific TP and metric breakdowns. + - Other pages such as TP Summary, Criteria Based Score, Detection Stats, and Bounding Box Viewer share the Run selected in Overview through `st.session_state`, so it is best to **select the Run in Overview first** and then move to the detailed pages. + +![Overview page (Run selection and metrics display)](docs/images/overview.png) + +**Key point**: +- Whenever you add a new test, use `./data/<test_name>` as the Output Path in Download, then use that same path in Eval Results to generate Summary / Score. The new test will appear in the Overview Run list, and you can inspect it immediately. + +## Main Features +- Select a Run on the Overview page, switch between single-run and compare mode, and display overall metrics +- When the production task queue is enabled, track heavy jobs from the UI such as "Recent tasks" +- TP / position / velocity statistical viewers (scatter plots and distributions) +- Criteria-based evaluation viewer (metric distributions, averages, and box plots) +- Detection statistics comparison viewer (for example TP / FP distance-bin comparison) +- BEV bounding-box visualization +- TLR (Traffic Light Recognition) evaluation analysis: criteria matrices, vehicle state vs.
signal type, important zones. Requires scenario data downloaded from tab 2 of the Download page. +- Evaluation command generation tool +- **Docker production**: Navigate from Overview to **Deployment debug** (Postgres / Redis / RQ and optional Docker operations) + +## Directory Structure +```text +evaluation_dashboard_app/ + Overview.py + pages/ + 1_TP_Summary.py … 10_Help.py, 99_Deployment_Debug.py (sidebar order follows the page numbers) + lib/ + worker/ # Production: RQ tasks and worker entrypoint + configs/ + autoware_evaluator_dl_config.json + deploy/ # Production: compose, nginx, numbered shell steps + docker-compose.yml + .env.example + 01_SETUP_ENV.sh ... 09_RESTART_WORKER.sh + configs/ + autoware_evaluator_dl_config.json # Mounted inside the container at /app/docker_config during compose runs + nginx/ + data/ + <test_name>/ + Summary.csv + Score.csv + *.parquet +``` + +## Page Guide + +The sidebar order follows the numbering of **`number_name.py` files directly under `pages/`**. **Deployment debug** (`99_Deployment_Debug.py`) must stay directly under `pages/` because it is registered through `st.page_link`. Outside Docker, `inject_app_page_styles` hides that sidebar item with CSS. Inside Docker, there is an explicit link from **Overview**. + +Many visualization pages rely on `st.session_state`, so it is best to **select the mode (single / compare) and Run in Overview first**. In compare mode, Baseline (A) and Candidate (B...) are shared across pages. + +### `Overview.py` (entry point) +- Starting point for **shared filters** such as single / compare mode, Run selection, and Perception / Product labels. +- **Shareable URL**: The same view can be reproduced using query parameters like `mode`, `run_a`, `run_b`, and so on. Some other pages follow the same pattern. +- When running in Docker, the sidebar shows a link to **Deployment debug** (`pages/99_Deployment_Debug.py`). + +### `pages/1_TP_Summary.py` +- **Prerequisite**: Data must already be loaded in Overview.
**`Summary.csv` is required**. If a Run does not have it, TP Summary is unavailable, while Detection Stats / BB Viewer can still work with only parquet files and show guidance accordingly. +- In compare mode, **deltas between runs** can be reflected in plots. +- `TP` range, velocity outlier clipping, scatter plots (`xrms`-`yrms`, `vx`-`vy`), and distribution histograms. + +### `pages/2_Criteria_Based_Score.py` +- A criteria evaluation viewer based on **`Score.csv`**. Follows the mode selected in Overview. +- Criteria block switching, metric distributions, group averages, box plots, and scenario-level comparisons. +- Includes UI for **Absolute gates** (sign-off by threshold pass / fail) and gate comparison across multiple Runs. + +### `pages/3_Detection_Stats.py` +- Aggregates detection evaluation data using **`.parquet` + DuckDB**. Supports filters, hierarchical views, scenario breakdown, and **comparison across multiple Runs** when Overview is in compare mode. +- Distance-bin comparison by status such as TP / FP and color schemes for perception diffs (improved / worsened). + +### `pages/4_Bounding_Box_Viewer.py` +- **Prerequisite**: A Run must already be selected in Overview. +- Displays bounding boxes on a **BEV** from `.parquet`. Supports filtering by t4dataset, topic, label, visibility, and more. In compare mode, it can handle multiple Runs. + +### `pages/5_Tools.py` +- Evaluation command generation tool +- Extract Job ID / Suite ID from Report / Suite URLs + +### `pages/6_Download.py` +- Main integration point with the evaluator. The **tabs** are organized as follows: + + | Tab | Contents | + |------|------| + | **Download Results** | Retrieve job results such as archive ZIPs or Result JSON. Output Path is restricted under the data root. | + | **Download Scenarios** | Download scenario data. Required by **TLR Analysis**. | + | **View Downloads** | Review downloaded jobs and scenarios. 
| + | **Eval Results** | Run evaluation or generate **Summary.csv / Score.csv** from `result.txt` / `score.json` under a root directory. | + +- When **`USE_TASK_QUEUE=true`** (Redis + Worker + Postgres), heavy work is queued to workers, and you can track status from the UI through **Recent tasks** and related sections. + +### `pages/7_Data_Management.py` +- Displays the list of Runs under the data root, including size, update time, and whether Summary / Score / Parquet files exist. +- Download outputs as a **ZIP**, copy **share links** for Overview, and **delete** Runs to manage storage in a multi-user server environment. + +### `pages/8_Parquet_Debug.py` +- For development and troubleshooting. Reads **`.parquet` / `.pkl` / `result.json`** from file paths and shows schemas, keys, criteria state, and optional quick plots. +- Useful for debugging pipeline outputs inside the dashboard. + +### `pages/9_TLR_Analysis.py` +- **TLR (Traffic Light Recognition)** evaluation: criteria matrices, vehicle state vs. signal type, important zones, and more. Supports single / compare mode and **shareable URLs** such as `mode`, `path_a`, `path_b`. +- **Prerequisite**: Download scenario data from **Download Scenarios** on the **Download** page and select the TLR result directory as a Run. + +### `pages/10_Help.py` +- Displays the repository **README inside the app** so setup instructions, workflows, and documentation can be read directly in the browser. +- Since **Mermaid diagrams** in Markdown are not rendered by default in Streamlit, this page renders them with JavaScript (Mermaid.js). + +### `pages/99_Deployment_Debug.py` (Docker only) +- Available only when Streamlit is running **inside a container**. With local `streamlit run`, it stops at a guidance message. +- Because it must be registered as **`pages/*.py` directly under the folder** for `st.page_link`, the corresponding auto-navigation item is **hidden with CSS outside Docker**. 
In Docker, you can also open it from the **Overview** sidebar via "Deployment debug". +- Lets you inspect the state of Postgres / Redis / RQ, task counts, and, depending on configuration, the host Docker container list, recent logs, and restricted `docker exec`. +- In production, mounting the **Docker socket grants strong privileges**, so check the authentication, VPN, and `EVAL_DEPLOYMENT_DEBUG_*` settings in [docs/PRODUCTION_DEPLOYMENT.md](docs/PRODUCTION_DEPLOYMENT.md). + +## Data Formats (high level) +- `Summary.csv`: `id`, `TP`, `xstd`, `xrms`, `ystd`, `yrms`, `vx`, `vy`, `perception_label`, `product_label` +- `Score.csv`: Criteria evaluation metric blocks (`Scenario`, `Option`, `GT_OBJ`, then `criteria0..n`) +- `.parquet`: Fields used for detection statistics / bounding-box viewing, such as `x`, `y`, `length`, `width`, `yaw`, `label`, `source`, `status` + +# Docker Usage Guide + +The image is **ROS-based**, so the container environment matches the host ROS environment. + +### Build Steps + +Because private repositories (`tier4/webauto-auth-py`, `tier4/v_and_v_util`) are used, you must provide a **GitHub SSH key** during build time. +Use `~/.ssh/id_rsa` directly. No ssh-agent is required. + +```sh +cd evaluation_dashboard_app + +# Recommended: add --no-cache if you want to rebuild with the latest dependencies every time. +# If ROS is Humble (can be omitted) +docker build --no-cache --secret id=ssh,src=$HOME/.ssh/id_rsa -t evaluation-dashboard . + +# If you want to switch ROS_DISTRO to Iron / Jazzy etc. +docker build --build-arg ROS_DISTRO=iron --secret id=ssh,src=$HOME/.ssh/id_rsa -t evaluation-dashboard . +``` + +### Production deployment + +For multi-user / production use, the recommended setup is **Nginx -> Streamlit -> Redis (task queue) -> Worker -> Postgres**. Heavy jobs such as downloads, evaluation, Summary / Score CSV generation, and parquet generation are executed by workers instead of the UI process, and task state is stored in Postgres. 
+ +**Target Architecture:** + +```mermaid +flowchart LR + subgraph clients [Clients] + Browser[Browser] + end + subgraph edge [Edge] + Nginx[Nginx] + end + subgraph app [App Tier] + S1[Streamlit 1] + S2[Streamlit 2] + end + subgraph infra [Infrastructure] + Redis[Redis] + Postgres[Postgres] + end + subgraph workers [Workers] + W1[Worker 1] + W2[Worker N] + end + Browser --> Nginx + Nginx --> S1 + Nginx --> S2 + S1 --> Redis + S2 --> Redis + S1 --> Postgres + S2 --> Postgres + Redis --> W1 + Redis --> W2 + W1 --> Postgres + W2 --> Postgres + W1 --> DataRoot[Data root] + W2 --> DataRoot +``` + +- **Build**: As described above in "Build Steps", run `docker build ... -t evaluation-dashboard .` in `evaluation_dashboard_app/`. The compose services `streamlit1` (default), optional `streamlit2` (`--profile ha`), and `worker` all use this image. +- **Recommended flow (`deploy/` numbered scripts)**: Move into `deploy/` and run the scripts in order. All of them use `docker compose --env-file .env`. + + | Script | Description | + |-----------|------| + | `01_SETUP_ENV.sh` | Create `.env` from `.env.example` if it does not exist. **You still edit it manually.** | + | `02_BUILD.sh` | Build the image. You can pass arguments such as `--no-cache`. | + | `03_INIT_DB.sh` | **First time only**: after Postgres starts, run `init_db` to create task tables. | + | `04_START.sh` | Start the stack. Default worker count comes from `.env` `EVAL_COMPOSE_SCALE_WORKER`; for example `./04_START.sh --scale worker=3` overrides it. | + | `05_STOP.sh` | Stop the stack. | + | `06_STATUS.sh` | Check service status. | + | `07_LOGS.sh` | Run `docker compose logs -f`. Without arguments it shows all services; for example `./07_LOGS.sh worker`. | + | `08_REBUILD_AND_START.sh` | Build and then start the stack, same startup behavior as `04_START.sh`. | + | `09_RESTART_WORKER.sh` | Restart workers so code changes are reflected on the worker side. 
| + +- **Manual setup is also possible**: `cd deploy && cp .env.example .env` -> edit `.env` -> `docker compose --env-file .env up -d`. For first-time setup only, run `docker compose --env-file .env run --rm init_db` (equivalent to `03_INIT_DB.sh`). +- **Access**: In production compose, **Nginx listens on port 80**, and Streamlit is accessed through the proxy (see `docker-compose.yml` / `nginx/nginx.conf`). Since the source code and `lib/` are mounted, **Streamlit reloads easily when files change**, but **workers must be restarted after Python code changes**. +- **If the UI keeps loading forever**: Streamlit communicates with the browser over **WebSocket**. Suggested checks: (1) do a **hard reload** including cache reset or reopen in another tab, (2) by default Nginx points only to **one Streamlit app** (`streamlit1`), and a second instance should be enabled only when needed with `docker compose --profile ha up -d` plus upstream changes in `nginx.conf`, (3) set **`STREAMLIT_SERVER_COOKIE_SECRET`** in `deploy/.env` (a commented template entry is provided in `deploy/.env.example`), (4) use `.streamlit/config.toml` `enableWebsocketCompression = false` and Nginx `proxy_buffering off` plus suitable `proxy_*_timeout`, and (5) check logs with `docker compose logs streamlit1 nginx`. +- **502 Bad Gateway**: This happens when Nginx **cannot reach Streamlit** because the process exited, was killed by OOM, or stayed blocked for too long. Check `docker compose logs streamlit1` and host **`dmesg`** for OOM messages. Heavy pages can consume significant memory, so the **default single-instance setup** and the single upstream in `deploy/nginx/nginx.conf` are recommended. +- **Troubleshooting Detection Stats freezes / 502**: Set **`EVAL_DETECTION_STATS_DEBUG=1`** in `.env` so it is passed into the compose `streamlit1` service, then restart.
The **Detection Stats debug** expander at the bottom of the page and the stderr of **`docker compose logs streamlit1`** will show section boundaries, `getrusage` memory values, and elapsed time before / after DuckDB calls. +- **If a subpage says "load in Overview" even though Overview was already opened**: Session state is stored **in memory per replica**. Overview also syncs `mode` / `run_a` / `run_b`... into the URL, so when those query parameters remain in the address bar, subpages such as Detection Stats can **rebuild `run_a` into `runA`** via `lib/overview_url_hydrate.py`. Open **Overview once**, confirm the address bar contains `run_a=`, then move to the subpage, or reopen from the **Overview share link**. +- **Avoid duplicate config management**: During compose runs, `deploy/configs/autoware_evaluator_dl_config.json` is mounted inside the container as `EVAL_DASHBOARD_CONFIG` (`/app/docker_config/...`). This is a separate file from the host `configs/` version, so edit the one under `deploy/configs/` for Docker-specific settings. +- For detailed settings and environment variables, see [docs/PRODUCTION_DEPLOYMENT.md](docs/PRODUCTION_DEPLOYMENT.md). + +### Startup and data mount (single container) + +Always mount the `data/` directory so data is persisted and visible. + +```sh +docker run -p 8501:8501 \ + -v "$(pwd)/data:/app/data" \ + -v ~/.webauto:/root/.webauto \ + evaluation-dashboard +``` + +### Example: run in background (`-d`) + +If you want to start the container in detached mode, add `-d` and optionally set `--name`. If you want to synchronize the entire `/app` tree, including code and notebooks, with the host, use the following form. 
+ +```sh +docker run -d --name evaluation-dashboard \ + -p 8501:8501 \ + -v "$(pwd):/app" \ + -v ~/.webauto:/root/.webauto \ + evaluation-dashboard +``` + +### Multi-user deployment + +If multiple people access the same server for downloads, evaluation, result review, sharing, and data management, refer to the following points. + +- **Data root**: You can set the evaluation data root with environment variable `EVAL_DASHBOARD_DATA_ROOT` (default is `data`). Example: `-e EVAL_DASHBOARD_DATA_ROOT=/var/eval_dashboard/data` +- **Path restriction**: The Download Output Path and Eval Root directory are restricted under this data root, and path traversal is rejected. +- **Data Management page**: Lets you view the Run list, show sizes, delete Runs, and copy share links. You can remove unnecessary Runs to manage disk usage. +- **Sharing results**: By adding `?mode=...&run_a=...&run_b=...` to the Overview URL, you can share the same Run view. Links can be copied from Data Management or "Share this view" in Overview. +- See [docs/MULTI_USER_DEPLOYMENT.md](docs/MULTI_USER_DEPLOYMENT.md) for more details. + +### Debugging and shell access + +If you want shell access inside a running container, use one of the following methods. + +**1. Enter bash by container ID** +```sh +docker ps # check the [CONTAINER ID] +docker exec -it [CONTAINER ID] /bin/bash +``` + +**2. 
Start directly with bash as the entrypoint** +```sh +docker run -it --entrypoint bash \ + -v "$(pwd)/data:/app/data" \ + evaluation-dashboard +``` diff --git a/evaluation_dashboard_app/Readme.md b/evaluation_dashboard_app/Readme.md old mode 100644 new mode 100755 index add4149..02c2aa9 --- a/evaluation_dashboard_app/Readme.md +++ b/evaluation_dashboard_app/Readme.md @@ -29,6 +29,8 @@ pip install rq psycopg2-binary **Docker イメージ**では、公開依存は [`requirements-docker.txt`](requirements-docker.txt) で入り、ビルド時の SSH シークレットで webauto-auth・評価系のプライベートパッケージを追加インストールします([`Dockerfile`](Dockerfile) 参照)。 +PDF エクスポートでは Plotly/Kaleido の静的画像化を使うため、**Docker イメージ内に Chrome もインストール**されます。デプロイ環境で `Kaleido requires Google Chrome to be installed` が出た場合は、最新のイメージへ **再 build / 再 deploy** してください。 + ```sh # Install CLI tool (評価実行コマンド生成で利用する場合) pipx install git+ssh://git@github.com/tier4/v_and_v_util.git @@ -295,7 +297,7 @@ flowchart LR W2 --> DataRoot ``` -- **ビルド**: 上記「ビルド手順」のとおり `evaluation_dashboard_app/` で `docker build ... -t evaluation-dashboard .`(compose の `streamlit` / `worker` はこのイメージを参照します)。 +- **ビルド**: 上記「ビルド手順」のとおり `evaluation_dashboard_app/` で `docker build ... 
-t evaluation-dashboard .`(compose の `streamlit1`(既定)・任意の `streamlit2`(`--profile ha`)・`worker` はこのイメージを参照します)。 - **推奨フロー(`deploy/` の番号付きスクリプト)**: `deploy/` に移動して順に実行します(すべて `docker compose --env-file .env` を使います)。 | スクリプト | 内容 | @@ -303,15 +305,19 @@ flowchart LR | `01_SETUP_ENV.sh` | `.env` が無ければ `.env.example` から作成(**編集は手動**) | | `02_BUILD.sh` | イメージビルド(引数で `--no-cache` など可) | | `03_INIT_DB.sh` | **初回のみ**: Postgres 起動後に `init_db` でタスク用テーブル作成 | - | `04_START.sh` | スタック起動(例: `./04_START.sh --scale worker=3`) | + | `04_START.sh` | スタック起動(デフォルト worker 数は `.env` の `EVAL_COMPOSE_SCALE_WORKER`、例: `./04_START.sh --scale worker=3` で上書き可) | | `05_STOP.sh` | 停止 | | `06_STATUS.sh` | 状態確認 | | `07_LOGS.sh` | `docker compose logs -f`(省略時は全サービス、例: `./07_LOGS.sh worker`) | - | `08_REBUILD_AND_START.sh` | ビルド後に `up -d` | + | `08_REBUILD_AND_START.sh` | ビルド後に `04_START.sh` と同じ起動(worker 既定本数あり) | | `09_RESTART_WORKER.sh` | ワーカー再起動(コード変更を worker に反映) | - **手動でも同じことは可能**: `cd deploy && cp .env.example .env` → `.env` を編集 → `docker compose --env-file .env up -d`。初回のみ `docker compose --env-file .env run --rm init_db`(`03_INIT_DB.sh` と同等)。 - **アクセス**: 本番 compose では **Nginx がポート 80**、Streamlit はプロキシ経由(`docker-compose.yml` / `nginx/nginx.conf` 参照)。ソースや `lib/` はマウントされているため **Streamlit はファイル変更でリロード**しやすい一方、**ワーカーは Python 変更後に再起動**が必要です。 +- **UI がずっとロード中になるとき**: Streamlit はブラウザと **WebSocket** でつながります。対処の目安: (1) **ハードリロード**(キャッシュ削除込み)や別タブで開き直す。(2) **既定は Streamlit アプリ 1 台**(`streamlit1`)のみ Nginx が向き先にしています。2 台目が必要な場合のみ `docker compose --profile ha up -d` と `nginx.conf` の upstream 追記を参照。(3) compose で **`STREAMLIT_SERVER_COOKIE_SECRET`**(`deploy/.env.example`)。(4) **`.streamlit/config.toml`** の `enableWebsocketCompression = false` と Nginx の **`proxy_buffering off`** / `proxy_*_timeout`。(5) ログ: `docker compose logs streamlit1 nginx`。 +- **502 Bad Gateway**: Nginx が **Streamlit に繋がらない**ときに出ます(プロセス落ち・OOM・長時間ブロックで切断など)。`docker compose logs streamlit1` とホストの **`dmesg`(OOM)** を確認。重いページはメモリを食うため、**既定の 1 
台構成**と `deploy/nginx/nginx.conf` の単一 upstream を推奨します。 +- **Detection Stats のフリーズ / 502 切り分け**: `.env` に **`EVAL_DETECTION_STATS_DEBUG=1`**(compose の `streamlit1` に渡る)を入れて再起動。ページ下部の **Detection Stats debug** 展開と **`docker compose logs streamlit1`** の stderr に、セクション境界・`getrusage` メモリ・DuckDB 前後の経過時間が出ます。 +- **サブページで「Overview で読み込み」と出るのに Overview は済んでいるとき**: セッション状態は **レプリカごとのメモリ**にあります。Overview は URL に `mode` / `run_a` / `run_b`…を同期するため、**同じ URL のクエリが付いたまま**ならサブページ(Detection Stats など)が **`run_a` から `runA` を再構築**します(`lib/overview_url_hydrate.py`)。一度 **Overview を開いて**アドレスバーに `run_a=` があることを確認してからサブページへ進むか、または **Overview の共有リンク**から開き直してください。 - **設定の二重管理を避ける**: compose 実行時は `deploy/configs/autoware_evaluator_dl_config.json` がコンテナ内 `EVAL_DASHBOARD_CONFIG`(`/app/docker_config/...`)としてマウントされます。ホストの `configs/` とは別ファイルなので、Docker 用に変えたい値はこちらを編集します。 - 詳細・環境変数一覧は [docs/PRODUCTION_DEPLOYMENT.md](docs/PRODUCTION_DEPLOYMENT.md) を参照してください。 diff --git a/evaluation_dashboard_app/catalogs.json b/evaluation_dashboard_app/catalogs.json new file mode 100755 index 0000000..4270cc5 --- /dev/null +++ b/evaluation_dashboard_app/catalogs.json @@ -0,0 +1,38 @@ +[ + { + "display_name": "Build Test Catalog", + "catalog_id": "bd0569ec-9826-44ac-8780-45b4cea624e6", + "description": "Try this catalog for testing build integration", + "integration_id": "900d2096-a112-48f0-a65e-27e122aad86a" + }, + { + "display_name": "Performance Test", + "catalog_id": "e36d75b9-6c3a-4970-9b9b-5cd13f7a9da3", + "description": "To calculate perception metrics and create metrics report", + "integration_id": "96ad8fba-0228-4c2b-9166-07d4de1a0760" + }, + { + "display_name": "Old performance test", + "catalog_id": "e2efe01d-e0c6-4d49-8223-817ff5d73204", + "description": "Run perception metrics test we have done previously", + "integration_id": "6126e86f-615f-4b84-9643-91b88db606bd" + }, + { + "display_name": "Devops Test", + "catalog_id": "ab0f8498-cc1b-4726-836f-e18e8bcb3200", + "description": "Edge case for devops 
integration", + "integration_id": "295cff78-9bc9-4d60-b7aa-f95be6ff96a4" + }, + { + "display_name": "Usecase Performance Catalog", + "catalog_id": "09039022-ec91-41bf-9e93-fdefccdfc9bc", + "description": "[WIP] Run evaluation based on planning scene catalog.", + "integration_id": "51f89d37-5c65-4449-9add-8971d0a79a7a" + }, + { + "display_name": "L4 regression test", + "catalog_id": "14b1d54b-5c9f-4cbf-a7e1-0eebceb1d30f", + "description": "[WARN] This is a regression test for L4, please do not use it for other purposes", + "integration_id": "c5f58b3c-8974-4f33-a8fa-e1f443320cfd" + } +] diff --git a/evaluation_dashboard_app/deploy/.env b/evaluation_dashboard_app/deploy/.env index 1bfde14..75a1ec2 100644 --- a/evaluation_dashboard_app/deploy/.env +++ b/evaluation_dashboard_app/deploy/.env @@ -17,6 +17,10 @@ DATABASE_URL=postgresql://eval_user:eval_pass@postgres:5432/eval_dashboard REDIS_URL=redis://redis:6379/0 RQ_QUEUE=default +# T4 visualizer server base URL used by Bounding Box Viewer / T4 pages +# For Docker-on-Linux, host.docker.internal is mapped via docker-compose extra_hosts +T4_VISUALIZER_BASE_URL=http://10.0.6.148:8000 + # Optional: per-user task visibility (company auth / WebAutoAuth) # Header name set by auth proxy with current user id (e.g. 
X-Forwarded-User) # AUTH_USER_HEADER=X-Forwarded-User @@ -26,4 +30,5 @@ RQ_QUEUE=default # NGINX_HTTPS=1 EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT=deploy -EVAL_DEPLOYMENT_DEBUG_EXEC=1 \ No newline at end of file +EVAL_DEPLOYMENT_DEBUG_EXEC=1 +EVAL_COMPOSE_SCALE_WORKER=3 \ No newline at end of file diff --git a/evaluation_dashboard_app/deploy/.env.example b/evaluation_dashboard_app/deploy/.env.example index 1add32b..50d5a65 100644 --- a/evaluation_dashboard_app/deploy/.env.example +++ b/evaluation_dashboard_app/deploy/.env.example @@ -17,6 +17,19 @@ DATABASE_URL=postgresql://eval_user:eval_pass@postgres:5432/eval_dashboard REDIS_URL=redis://redis:6379/0 RQ_QUEUE=default +# T4 visualizer server base URL used by Bounding Box Viewer / T4 pages +# In Docker, set to a host-reachable endpoint (compose maps host.docker.internal) +T4_VISUALIZER_BASE_URL=http://host.docker.internal:8000 + +# Docker Compose: default worker replica count (04_START.sh / 08_REBUILD_AND_START.sh). Streamlit defaults to streamlit1 only; optional second app server: compose --profile ha (see docker-compose.yml + nginx.conf). +EVAL_COMPOSE_SCALE_WORKER=2 + +# Same secret on both Streamlit containers (session cookies / multi-replica). Compose sets a dev default; override in production: openssl rand -hex 32 +# STREAMLIT_SERVER_COOKIE_SECRET= + +# Detection Stats page: stderr timing logs + debug expander (docker compose logs streamlit1) +# EVAL_DETECTION_STATS_DEBUG=1 + # RQ: max job runtime before the worker kills the job (seconds). Default 7 days if unset. 
# RQ_JOB_TIMEOUT_SEC=604800 # Optional: longer timeout for build_parquet only (defaults to RQ_JOB_TIMEOUT_SEC if unset) diff --git a/evaluation_dashboard_app/deploy/.streamlit/config.toml b/evaluation_dashboard_app/deploy/.streamlit/config.toml new file mode 100644 index 0000000..14d8726 --- /dev/null +++ b/evaluation_dashboard_app/deploy/.streamlit/config.toml @@ -0,0 +1,13 @@ +# Streamlit project config (used for local `streamlit run` and Docker WORKDIR=/app). +# See https://docs.streamlit.io/develop/api-reference/configuration/config.toml + +[server] +# Local default: open browser when running outside Docker +headless = false + +# Behind nginx or other proxies, per-message WebSocket compression can break or stall +# some setups (see Streamlit troubleshooting: "App is not loading when running remotely"). +enableWebsocketCompression = false + +# cookieSecret: MUST be identical on every Streamlit replica behind a load balancer. +# Set via environment in Docker: STREAMLIT_SERVER_COOKIE_SECRET (see deploy/docker-compose.yml). diff --git a/evaluation_dashboard_app/deploy/04_START.sh b/evaluation_dashboard_app/deploy/04_START.sh index e087e35..451c209 100755 --- a/evaluation_dashboard_app/deploy/04_START.sh +++ b/evaluation_dashboard_app/deploy/04_START.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -# 04 — Start the full stack, or if it is already running: up -d (apply compose/scale) then restart all services. -# Extra args: e.g. ./04_START.sh --scale worker=3 +# 04 — Start or update the full stack with docker compose up -d. +# Default: 2 worker replicas (EVAL_COMPOSE_SCALE_WORKER in .env). Override: ./04_START.sh --scale worker=1 (last --scale wins). set -euo pipefail DEPLOY_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "$DEPLOY_DIR" @@ -9,12 +9,16 @@ if [[ ! 
-f .env ]]; then exit 1 fi +set -a +# shellcheck disable=SC1091 +source .env +set +a +WORKER_SCALE="${EVAL_COMPOSE_SCALE_WORKER:-2}" + dc() { docker compose --env-file .env "$@"; } -if [[ -n "$(dc ps -q --status running 2>/dev/null || true)" ]]; then - echo "Stack already running — updating with up -d, then restarting all services." - dc up -d "$@" - dc restart -else - dc up -d "$@" -fi +dc up -d --scale "worker=${WORKER_SCALE}" "$@" + +# Nginx resolves Docker service names at startup. Recreate it after Streamlit is +# up so it remounts the current nginx.conf and cannot keep a stale container IP. +dc up -d --no-deps --force-recreate nginx diff --git a/evaluation_dashboard_app/deploy/08_REBUILD_AND_START.sh b/evaluation_dashboard_app/deploy/08_REBUILD_AND_START.sh index b216686..2763b55 100755 --- a/evaluation_dashboard_app/deploy/08_REBUILD_AND_START.sh +++ b/evaluation_dashboard_app/deploy/08_REBUILD_AND_START.sh @@ -8,4 +8,4 @@ if [[ ! -f .env ]]; then exit 1 fi docker compose --env-file .env build "$@" -docker compose --env-file .env up -d +exec "$DEPLOY_DIR/04_START.sh" diff --git a/evaluation_dashboard_app/deploy/docker-compose.yml b/evaluation_dashboard_app/deploy/docker-compose.yml index 8fd4b87..54fee77 100644 --- a/evaluation_dashboard_app/deploy/docker-compose.yml +++ b/evaluation_dashboard_app/deploy/docker-compose.yml @@ -1,8 +1,10 @@ -# Production-style stack: Nginx -> Streamlit, Redis, Worker, Postgres. +# Production-style stack: Nginx -> Streamlit (default: one app replica) + Workers (default scale via EVAL_COMPOSE_SCALE_WORKER in .env / 04_START.sh). +# Second Streamlit: optional HA profile — `docker compose --profile ha up -d` and uncomment streamlit2 in deploy/nginx/nginx.conf upstream. 
# Helper scripts (run from deploy/): 01_SETUP_ENV.sh 02_BUILD.sh 03_INIT_DB.sh 04_START.sh 05_STOP.sh # 06_STATUS.sh 07_LOGS.sh 08_REBUILD_AND_START.sh 09_RESTART_WORKER.sh # Run from deploy/: docker compose --env-file .env up -d -# Scale workers: docker-compose up -d --scale worker=3 (default 1 worker) +# Plain `up -d` uses one worker unless you pass --scale worker=N; 04_START.sh defaults to EVAL_COMPOSE_SCALE_WORKER (2). +# More Streamlit boxes: duplicate x-streamlit-app block as streamlit3, add server to nginx upstream. # Build image from repo root: docker build -t evaluation-dashboard . (see Readme) # # Data is bind-mounted to the host so you can access it directly: @@ -11,7 +13,7 @@ # - ${HOME}/.webauto -> Download/Scenario API credentials (streamlit + worker) # # App source is mounted so you can edit Python code without rebuilding the image. -# Streamlit will reload on file changes. Restart the worker to pick up changes: docker compose restart worker +# Streamlit will reload on file changes. Restart workers: docker compose restart worker # # Deployment debug (pages/99_Deployment_Debug.py; nav hidden outside Docker via CSS; sidebar link on Overview in Docker): Streamlit mounts the host # Docker socket and sets EVAL_DEPLOYMENT_DEBUG_DOCKER=1. Anyone who can use the dashboard @@ -19,6 +21,66 @@ # networks. Set EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT in .env (see .env.example). # EVAL_DEPLOYMENT_DEBUG_EXEC=1 in .env enables one-shot shell (docker exec) from the UI. +x-streamlit-app: &streamlit-app + build: + context: .. + dockerfile: Dockerfile + secrets: + - ssh + image: evaluation-dashboard + command: ["/app/docker-entrypoint.sh"] + environment: + - TZ=Asia/Tokyo + # Same value on streamlit1 + streamlit2 so session cookies validate behind nginx (see .streamlit/config.toml). + # Override in .env for production (e.g. openssl rand -hex 32). 
+ - STREAMLIT_SERVER_COOKIE_SECRET=${STREAMLIT_SERVER_COOKIE_SECRET:-evaluationdashboard-streamlit-cookie-secret-change-in-production} + # Verbose stderr logs + timing expander on Detection Stats page (see lib/detection_stats_debug.py) + - EVAL_DETECTION_STATS_DEBUG=${EVAL_DETECTION_STATS_DEBUG:-0} + - EVAL_DASHBOARD_DATA_ROOT=${EVAL_DASHBOARD_DATA_ROOT:-/app/data} + - EVAL_DASHBOARD_CONFIG=/app/docker_config/autoware_evaluator_dl_config.json + - USE_TASK_QUEUE=${USE_TASK_QUEUE:-true} + - DATABASE_URL=${DATABASE_URL} + - REDIS_URL=${REDIS_URL:-redis://redis:6379/0} + - RQ_QUEUE=${RQ_QUEUE:-default} + - EVAL_DEPLOYMENT_DEBUG_DOCKER=1 + - EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT=${EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT:-} + # One-shot shell in selected container (docker exec). Default off; set to 1 in .env only when needed. + - EVAL_DEPLOYMENT_DEBUG_EXEC=${EVAL_DEPLOYMENT_DEBUG_EXEC:-0} + volumes: + - ../data:/app/data + - ${HOME}/.webauto:/root/.webauto + # Docker-only config (separate from your local configs) + - ./configs:/app/docker_config + # Mount app source so code changes apply without rebuild (Streamlit auto-reloads) + - ../Overview.py:/app/Overview.py + - ../docker-entrypoint.sh:/app/docker-entrypoint.sh + - ../catalogs.json:/app/catalogs.json + - ../pages:/app/pages + - ../Readme.md:/app/Readme.md + - ../Readme.en.md:/app/Readme.en.md + - ../lib:/app/lib + - ../worker:/app/worker + - ../configs:/app/configs + - ../static:/app/static + - ../.streamlit:/app/.streamlit + - /var/run/docker.sock:/var/run/docker.sock + extra_hosts: + - "host.docker.internal:host-gateway" + env_file: + - .env + depends_on: + redis: + condition: service_started + postgres: + condition: service_healthy + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8501/_stcore/health >/dev/null || curl -fsS http://localhost:8501/healthz >/dev/null"] + interval: 5s + timeout: 3s + retries: 30 + start_period: 20s + restart: unless-stopped + services: nginx: image: nginx:alpine @@ 
-27,55 +89,27 @@ services: volumes: - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro depends_on: - - streamlit - restart: unless-stopped - - streamlit: - build: - context: .. - dockerfile: Dockerfile - secrets: - - ssh - image: evaluation-dashboard - command: ["/app/docker-entrypoint.sh"] - environment: - - TZ=Asia/Tokyo - - EVAL_DASHBOARD_DATA_ROOT=${EVAL_DASHBOARD_DATA_ROOT:-/app/data} - - EVAL_DASHBOARD_CONFIG=/app/docker_config/autoware_evaluator_dl_config.json - - USE_TASK_QUEUE=${USE_TASK_QUEUE:-true} - - DATABASE_URL=${DATABASE_URL} - - REDIS_URL=${REDIS_URL:-redis://redis:6379/0} - - RQ_QUEUE=${RQ_QUEUE:-default} - - EVAL_DEPLOYMENT_DEBUG_DOCKER=1 - - EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT=${EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT:-} - # One-shot shell in selected container (docker exec). Default off; set to 1 in .env only when needed. - - EVAL_DEPLOYMENT_DEBUG_EXEC=${EVAL_DEPLOYMENT_DEBUG_EXEC:-0} - volumes: - - ../data:/app/data - - ${HOME}/.webauto:/root/.webauto - # Docker-only config (separate from your local configs) - - ./configs:/app/docker_config - # Mount app source so code changes apply without rebuild (Streamlit auto-reloads) - - ../Overview.py:/app/Overview.py - - ../pages:/app/pages - - ../Readme.md:/app/Readme.md - - ../lib:/app/lib - - ../worker:/app/worker - - ../configs:/app/configs - - /var/run/docker.sock:/var/run/docker.sock - env_file: - - .env - depends_on: - redis: - condition: service_started - postgres: + streamlit1: condition: service_healthy restart: unless-stopped + streamlit1: + <<: *streamlit-app + + streamlit2: + <<: *streamlit-app + profiles: + - ha + redis: image: redis:7-alpine restart: unless-stopped - + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 5s + timeout: 3s + retries: 5 + worker: build: context: .. 
@@ -98,11 +132,15 @@ services: - ./configs:/app/docker_config # Mount app source so code changes apply without rebuild (restart worker to pick up: docker compose restart worker) - ../Overview.py:/app/Overview.py + - ../catalogs.json:/app/catalogs.json - ../pages:/app/pages - ../Readme.md:/app/Readme.md + - ../Readme.en.md:/app/Readme.en.md - ../lib:/app/lib - ../worker:/app/worker - ../configs:/app/configs + extra_hosts: + - "host.docker.internal:host-gateway" env_file: - .env depends_on: @@ -144,4 +182,4 @@ services: secrets: ssh: - file: ${HOME}/.ssh/id_rsa \ No newline at end of file + file: ${HOME}/.ssh/id_rsa diff --git a/evaluation_dashboard_app/deploy/nginx/nginx.conf b/evaluation_dashboard_app/deploy/nginx/nginx.conf index a4766dd..de69355 100644 --- a/evaluation_dashboard_app/deploy/nginx/nginx.conf +++ b/evaluation_dashboard_app/deploy/nginx/nginx.conf @@ -1,32 +1,40 @@ # Nginx: reverse proxy to Streamlit with WebSocket support. -# For multiple Streamlit replicas, add more "server streamlit:8501" lines in upstream. - +# +# Default upstream is streamlit1 only. A second replica (streamlit2) is optional in docker-compose +# (profile "ha"); if you add it, duplicate the server line below and use ip_hash for sticky sessions. +# Pointing nginx at a dead/crashed upstream yields 502 — single replica reduces RAM pressure and failure modes. events { - worker_connections 1024; + worker_connections 2048; } http { - upstream streamlit { - server streamlit:8501; - # Add more servers for load balancing: - # server streamlit2:8501; - # server streamlit3:8501; - } + # Docker's embedded DNS. Resolve Streamlit at request time so nginx does not keep + # a stale container IP after `docker compose up -d` recreates streamlit1. 
+ resolver 127.0.0.11 valid=10s ipv6=off; server { listen 80; server_name _; + client_max_body_size 200m; + location / { - proxy_pass http://streamlit; + set $streamlit_upstream streamlit1:8501; + proxy_pass http://$streamlit_upstream; proxy_http_version 1.1; + proxy_buffering off; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-Proto $scheme; proxy_set_header Upgrade $http_upgrade; proxy_set_header Connection "upgrade"; + proxy_connect_timeout 60s; + proxy_send_timeout 86400; proxy_read_timeout 86400; + # Large Streamlit responses / occasional upstream quirks + proxy_buffer_size 128k; + proxy_buffers 8 256k; } } } diff --git a/evaluation_dashboard_app/docker-entrypoint.sh b/evaluation_dashboard_app/docker-entrypoint.sh old mode 100644 new mode 100755 index c37a1b5..a3c26ac --- a/evaluation_dashboard_app/docker-entrypoint.sh +++ b/evaluation_dashboard_app/docker-entrypoint.sh @@ -5,4 +5,4 @@ if [[ -n "${ROS_DISTRO}" && -f "/opt/ros/${ROS_DISTRO}/setup.bash" ]]; then source "/opt/ros/${ROS_DISTRO}/setup.bash" fi -exec streamlit run Overview.py --server.address=0.0.0.0 --server.port=8501 "$@" +exec streamlit run Overview.py --server.address=0.0.0.0 --server.port=8501 --server.headless=true --server.enableStaticServing=true "$@" diff --git a/evaluation_dashboard_app/docs/PRODUCTION_DEPLOYMENT.md b/evaluation_dashboard_app/docs/PRODUCTION_DEPLOYMENT.md index 3e84af6..4bc22fc 100644 --- a/evaluation_dashboard_app/docs/PRODUCTION_DEPLOYMENT.md +++ b/evaluation_dashboard_app/docs/PRODUCTION_DEPLOYMENT.md @@ -72,6 +72,7 @@ Heavy operations (download results, download scenarios, run eval_result, generat | `EVAL_DEPLOYMENT_DEBUG_DOCKER` | Set to `1` in [`deploy/docker-compose.yml`](deploy/docker-compose.yml) for Streamlit; enables the **Docker** tab when the host socket is mounted. Override in `.env` only if you change compose. 
| `1` in compose | | `EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT` | Compose project name (`docker compose ls`) to filter containers by `com.docker.compose.project`. Strongly recommended when the host runs other stacks. | (empty) | | `EVAL_DEPLOYMENT_DEBUG_EXEC` | When `1`/`true`, the Deployment debug **Docker** tab shows **Run command** (`sh -c` via `docker exec`). Default `0` in compose — enable in `.env` only briefly on trusted networks. | `0` | +| `EVAL_COMPOSE_SCALE_WORKER` | Default number of `worker` replicas when using [`deploy/04_START.sh`](deploy/04_START.sh) / [`08_REBUILD_AND_START.sh`](deploy/08_REBUILD_AND_START.sh). | `2` | ## Build @@ -113,26 +114,23 @@ docker compose build --no-cache docker compose up -d ``` - To run multiple workers, use `--scale worker=N` (e.g. 3 workers): + The stack defaults to **one Streamlit** container (`streamlit1`; `streamlit2` is optional and only starts with the `ha` Compose profile) behind Nginx and **two workers** (`EVAL_COMPOSE_SCALE_WORKER=2` in `.env`, applied by [`04_START.sh`](deploy/04_START.sh)). Override worker count with `--scale worker=N` (last flag wins) or change `EVAL_COMPOSE_SCALE_WORKER`. ```sh - docker-compose up -d --scale worker=3 + docker compose up -d --scale worker=3 ``` - Default is one worker. All worker replicas share the same RQ queue. + All worker replicas share the same RQ queue. 4. **Access the app** - Via Nginx: **http://localhost** (port 80) - Streamlit directly (if you expose it): port 8501 on the `streamlit` service (not exposed by default when using Nginx) + - Streamlit directly (if you expose ports in compose): 8501 on `streamlit1` / `streamlit2` (not exposed by default when using Nginx) ## Scaling -- **Workers**: Use Docker Compose `--scale` to run more worker containers. From the `deploy/` directory: - - **Default (1 worker):** `docker-compose up -d` - - **N workers:** `docker-compose up -d --scale worker=N` - Example: `docker-compose up -d --scale worker=3` runs three workers; all consume from the same RQ queue. 
-- **Streamlit replicas**: In `deploy/docker-compose.yml`, duplicate the `streamlit` service (e.g. `streamlit2`) and add `server streamlit2:8501;` to `deploy/nginx/nginx.conf` in the `upstream streamlit` block. +- **Workers**: Default replica count is `EVAL_COMPOSE_SCALE_WORKER` (see `.env.example`; [`04_START.sh`](deploy/04_START.sh) passes `--scale worker=…`). From the `deploy/` directory you can also run `docker compose up -d --scale worker=N` (e.g. three workers); all consume from the same RQ queue. +- **Streamlit replicas**: By default only `streamlit1` runs, and Nginx proxies directly to `streamlit1:8501` (there is no `upstream` block in the shipped config). `streamlit2` is defined behind the `ha` Compose profile (`docker compose --profile ha up -d`); to send traffic to it, add an `upstream` block with `ip_hash` (for session stickiness) listing both servers in [`deploy/nginx/nginx.conf`](deploy/nginx/nginx.conf) and point `proxy_pass` at it. To add more, add another service that merges the shared Streamlit definition (`<<: *streamlit-app`) in [`deploy/docker-compose.yml`](deploy/docker-compose.yml), add `depends_on` for Nginx, and add `server streamlit3:8501;` (etc.) to that `upstream` block. ## TLS (HTTPS) @@ -151,18 +149,18 @@ To serve over HTTPS, configure Nginx with SSL certificates (e.g. Let's Encrypt) | "Failed to enqueue task" | `REDIS_URL` and `DATABASE_URL` are set; Redis and Postgres containers are running; `USE_TASK_QUEUE=true`. | | Tasks stay "pending" | Worker container is running; same `REDIS_URL` and `RQ_QUEUE` as Streamlit; worker logs for errors. | | Postgres connection refused | Postgres is healthy (`docker-compose ps`); `DATABASE_URL` uses hostname `postgres` and correct port (5432). | -| Nginx 502 Bad Gateway | Streamlit container is up and listening on 8501; Nginx `upstream` points to `streamlit:8501`. | +| Nginx 502 Bad Gateway | Streamlit container (`streamlit1`) is up, healthy, and listening on 8501; Nginx `proxy_pass` targets `streamlit1:8501` (plus any replicas you added to an `upstream` block). 
| ## Deployment debug page (Docker socket) The Streamlit page **Deployment debug** (`pages/99_Deployment_Debug.py` — required at top level so `st.page_link` works; default sidebar entry is hidden outside Docker via CSS; **Overview** adds a sidebar link when running in Docker) shows redacted environment variables, Postgres/Redis/RQ checks, task counts, and Docker container status and log tails. -- [`deploy/docker-compose.yml`](deploy/docker-compose.yml) mounts `/var/run/docker.sock` into the `streamlit` service and sets `EVAL_DEPLOYMENT_DEBUG_DOCKER=1`. After `docker compose up -d`, restart or recreate Streamlit if you change compose or env. +- [`deploy/docker-compose.yml`](deploy/docker-compose.yml) mounts `/var/run/docker.sock` into each Streamlit service (`streamlit1`, `streamlit2`) and sets `EVAL_DEPLOYMENT_DEBUG_DOCKER=1`. After `docker compose up -d`, restart or recreate those services if you change compose or env. - Set `EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT` in `.env` to your Compose project name (from `docker compose ls`) so the UI lists only this stack’s containers. If it is unset, the page lists every container visible to the daemon and shows a warning. -- Rebuild the image after adding the `docker` PyPI package to `requirements-docker.txt` (or `docker compose build streamlit`). +- Rebuild the image after adding the `docker` PyPI package to `requirements-docker.txt` (or `docker compose build streamlit1`). - **Exec**: set `EVAL_DEPLOYMENT_DEBUG_EXEC=1` in `.env` and recreate Streamlit to enable one-shot `sh -c` commands in the selected container (same power as `docker exec`). Leave at `0` when you only need logs. -**Risk**: any user who can open the app with socket access can read logs for containers matched by the filter. With `EVAL_DEPLOYMENT_DEBUG_EXEC=1`, they can also run shell commands inside those containers. 
Restrict access with VPN, SSO/auth proxy, or remove the socket mount and debug env from the `streamlit` service in compose if that risk is unacceptable. +**Risk**: any user who can open the app with socket access can read logs for containers matched by the filter. With `EVAL_DEPLOYMENT_DEBUG_EXEC=1`, they can also run shell commands inside those containers. Restrict access with VPN, SSO/auth proxy, or remove the socket mount and debug env from the Streamlit services in compose if that risk is unacceptable. ## Data on the host (bind mounts) @@ -198,7 +196,7 @@ Rebuild the image only when you change dependencies (e.g. `requirements-docker.t ``` deploy/ - docker-compose.yml # full stack; streamlit includes Docker socket for Deployment debug + docker-compose.yml # full stack; streamlit1/streamlit2 + Docker socket for Deployment debug .env.example nginx/ nginx.conf diff --git a/evaluation_dashboard_app/docs/guide/data_reports.html b/evaluation_dashboard_app/docs/guide/data_reports.html new file mode 100644 index 0000000..a9a5d14 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/data_reports.html @@ -0,0 +1,245 @@ + + + + + + Evaluation Dashboard Data and Reports + + + +
+
+
Data and Reports
+

Artifacts

+

+ The dashboard is driven by files. Understanding which file powers which page makes the app much easier to use and debug. +

+
+
+ + +
+
+
+
+
Run Model
+

A run is a folder under the data root.

+

+ The default data root is data/. In production it can be changed with + EVAL_DASHBOARD_DATA_ROOT. Download and Eval paths are restricted under this root. +

+
+
+
+
data/
+
my_test_20250203/
+
Summary.csv
+
Score.csv
+
result.txt / score.json / logs...
+
current.parquet / future.parquet
+
resources/metadata.yaml + summary.json
+
specsheet/specsheet.pdf
+
+
+

Why one folder per test?

+

+ It keeps Overview selection simple, makes Data Management safer, and lets users share links using stable run names. + If output is scattered across arbitrary folders, users cannot easily know what to select or delete. +

+
+
+
+
+ +
+
+
+
Core Files
+

Which artifact powers which page?

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Artifact | Created By | Used By | Meaning
Summary.csv | Download -> Eval Results | Overview, TP Summary | Object-level summary metrics such as TP, x/y RMS, x/y STD, velocity, perception labels, and product labels.
Score.csv | Download -> Eval Results | Criteria Based Score | Criteria block metrics including scenario, optional dataset ID, option, GT object, criteria label, NM, TP/TN, ADD, AIL, UIL, PFN/PFP, Practical Pass Rate, thresholds, and counts.
.parquet | Download/eval/parquet build workflows | Detection Stats, Bounding Box Viewer, Prediction Evaluation, Debug | Structured frame/object rows: position, dimensions, yaw, label, status, source, scenario metadata, and prediction metrics.
metadata.yaml + summary.json | Specsheet/trend generation or analyzer output | Trend Insights, Specsheet trend export | Release identity and trend summary payloads. Summary shape decides full/usecase/devops role.
+
+
+ +
+
+
+
Score.csv Structure
+

Score.csv is the Criteria page source.

+

+ Each row describes one scenario result and then repeats the criteria metric block for each + available criteria range. +

+
+ + + + + + + + + + + + + + + + + + + +
Part | Fields | How the dashboard uses it
Row identity | Scenario, optional Dataset | Used for scenario filters, scenario leaderboards, compare joins, gates, and PDF tables.
Scenario context | Option, GT_OBJ | Used for grouping charts and understanding the matching policy/object class behind the row.
Criteria block | Distance, NM, TP/TN, ADD, AIL, UIL, PFN/PFP, UUID Num, Practical Pass Rate, MAX_DIST_THRESH, OBJ_CNTS | Used by Criteria Based Score for distributions, deltas, absolute gates, and scenario-level ranking.
+
+ Tip: if two rows share a scenario name but have different dataset IDs, the app keeps + them separate in Criteria comparisons and gate summaries. +
+
+
+ +
+
+
+
Pass Metrics
+

TP is not the same metric as Practical Pass Rate.

+
+ + + + + + + + + + + + + + + + +
Dashboard label | Source | Calculation / meaning | Used by
TP, TP mean | Summary.csv | Comes from summarize_ratio() as TP rate. AIL and ADD are not added to this metric. | Overview, TP Summary, dashboard PDF TP sections.
pass_rate, Pass rate mean | Score.csv Practical Pass Rate | (TP/TN + ADD + AIL) / NM * 100. AIL and ADD are pass-side outcomes for this practical score. | Criteria Based Score, absolute pass/fail gates, Criteria PDF sections.
+
+ Important: when reviewing pass/fail gates, read “pass rate” as + Practical Pass Rate, not the traditional TP rate from Summary.csv. +
+
+
+ +
+
+
+
Trend Data
+

Trend summaries are classified by JSON shape.

+

+ Trend Insights scans the data root for metadata.yaml files that have sibling summary.json. + It then classifies the summary and groups related jobs into releases. +

+
+
+
+

Full performance

+

Summary has blocks containing the header 全数データセット評価. Used for mAP, precision, recall, error, and prediction trends.

+
+
+

Usecase

+

Summary has blocks containing ユースケース評価. It participates in release grouping and inventory.

+
+
+

DevOps pass-rate

+

Summary is a nested dictionary without blocks, with category results containing passed and total.

+
+
+
+
+ +
+
+
+
Report Outputs
+

Reports are optional outputs, not the main app path.

+

+ Users can explore directly in Streamlit, then export when they need a portable artifact for review. +

+
+
+
+

Dashboard PDF

+

Generated from the current Overview selection and filters. Best for summarizing the dashboard state as a curated report.

+
+
+

Release Specsheet PDF

+

Advanced release-oriented report generated through perception_catalog_analyzer. It can include trend pages when trend metadata is enabled.

+ +
+
+

ZIP outputs

+

Data Management can package outputs for download, useful when moving run artifacts out of a shared server.

+
+
+
+
+ +
+
+
+
Debugging by Artifact
+

When a page is empty, first check the file it needs.

+
+ + + + + + + + + +
Symptom | Likely missing | Fix
Overview summary is sparse | Summary.csv | Generate Summary.csv from Download -> Eval Results.
Criteria page has no rows | Score.csv | Generate Score.csv from result files or score JSON.
Detection/BEV pages cannot load | Parquet files | Build or place parquet artifacts under the expected data root/run path.
Trend Insights has no releases | metadata.yaml + summary.json | Generate or copy trend-compatible release outputs under data root.
Specsheet trend section says no data | Trend rows or PNG plots | Check trend classification and generated plot files in specsheet/.
+
+
+
+ + + + diff --git a/evaluation_dashboard_app/docs/guide/deployment.html b/evaluation_dashboard_app/docs/guide/deployment.html new file mode 100644 index 0000000..19b9c23 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/deployment.html @@ -0,0 +1,190 @@ + + + + + + Evaluation Dashboard Deployment Guide + + + +
+
+
Deployment and Operations
+

Deploy

+

+ Run locally for development. Use the production compose stack when multiple people need a shared server, + background workers, task history, and operational visibility. +

+
+
+ + +
+
+
+
+
Local Development
+

Fastest way to run the app.

+

+ Local mode is best for development, one-person analysis, and debugging. Heavy jobs run in the Streamlit process + unless task queue environment variables are enabled. +

+
+
+
+
local start
+
cd evaluation_dashboard_app
+pip install -r requirements.txt
+streamlit run Overview.py
+
+
+

Local prerequisites

+
    +
  • Python packages from requirements.txt.
  • +
  • pilot-auto / perception_eval environment only when generating Summary/Score.
  • +
  • Evaluator API credentials when using Download pages.
  • +
  • Chrome availability for some static image/PDF export flows.
  • +
+
+
+
+
+ +
+
+
+
Single Docker Container
+

Portable app container for simple usage.

+
+
+
+

Build image

+

Private dependencies may require passing a GitHub SSH key as a Docker build secret.

+
+
build
+
docker build --no-cache \
+  --secret id=ssh,src=$HOME/.ssh/id_rsa \
+  -t evaluation-dashboard .
+
+
+
+

Run with persistent data

+

Always mount the data directory so runs survive container restarts.

+
+
run
+
docker run -p 8501:8501 \
+  -v "$(pwd)/data:/app/data" \
+  -v ~/.webauto:/root/.webauto \
+  evaluation-dashboard
+
+
+
+
+
+ +
+
+
+
Production Stack
+

Nginx to Streamlit to Redis workers to Postgres.

+

+ In production, heavy operations should not block Streamlit. The app enqueues jobs to Redis, workers execute them, + and Postgres stores task state for Recent Tasks and operational visibility. +

+
+
+
Browser: Team users open the shared app.
+
Nginx: Reverse proxy, optional TLS/load balancing.
+
Streamlit: UI, filters, enqueue requests, task status.
+
Redis + Worker: RQ queue and heavy background jobs.
+
Postgres + Data: Task metadata and shared run artifacts.
+
+
+
recommended numbered scripts
+
cd deploy
+./01_SETUP_ENV.sh       # create .env if missing, then edit manually
+./02_BUILD.sh --no-cache
+./03_INIT_DB.sh         # first time only
+./04_START.sh           # start nginx, streamlit, redis, postgres, workers
+./06_STATUS.sh          # inspect service status
+./07_LOGS.sh worker     # tail logs for a service
+
+
+
+ +
+
+
+
Environment Variables
+

The settings that matter most.

+
+ + + + + + + + + + + +
Variable | Purpose
EVAL_DASHBOARD_DATA_ROOT | Shared evaluation data root. Streamlit and workers must see the same path.
USE_TASK_QUEUE | Enable Redis/RQ worker mode. Recommended for production.
DATABASE_URL | Postgres task metadata connection string.
REDIS_URL | Redis queue connection string.
RQ_JOB_TIMEOUT_SEC | Long timeout for downloads/eval jobs; default is intentionally much longer than RQ's built-in default.
EVAL_DASHBOARD_CONFIG | Docker-specific JSON config path mounted from deploy/configs/.
EVAL_DEPLOYMENT_DEBUG_EXEC | Enables Docker exec from Deployment Debug. Keep off unless briefly needed on a trusted network.
+
+
+ +
+
+
+
Multi-User Operation
+

A shared server, not per-user accounts.

+

+ The app is designed as a local-team tool. Everyone who can access the server can see shared data and use server-side API credentials. +

+
+
+

Shared data

All run folders under the data root are visible to all users.

+

Path safety

Download and eval paths are resolved under the data root; traversal is rejected.

+

Shared credentials

Download API credentials are mounted server-side, not entered by each user.

+

Share links

Users share Overview URLs with mode, run_a, and run_b.

+
+
+ Access control lives outside the app: use VPN, firewall, SSO proxy, or network controls if the server should only be reachable by your team. +
+
+
+ +
+
+
+
Operations Checklist
+

What to check when production feels unhealthy.

+
+ + + + + + + + + + +
Issue | Check
Failed to enqueue task | Confirm Redis, Postgres, USE_TASK_QUEUE=true, and matching URLs.
Tasks stay pending | Worker is running, same RQ_QUEUE, worker logs show no import/config errors.
Nginx 502 | Streamlit is listening on 8501, not OOM-killed, and the Nginx proxy target matches the Streamlit service name.
Subpage forgets Overview state | Use Overview share link with run_a query params, especially with multiple Streamlit replicas.
Detection Stats freezes | Set EVAL_DETECTION_STATS_DEBUG=1 and inspect section timing/memory output.
PDF Chrome/Kaleido error | Rebuild the image so Chrome is installed in the Docker environment.
+
+
+
+ + + + diff --git a/evaluation_dashboard_app/docs/guide/getting_started.html b/evaluation_dashboard_app/docs/guide/getting_started.html new file mode 100644 index 0000000..4940f26 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/getting_started.html @@ -0,0 +1,206 @@ + + + + + + How to Use the Evaluation Dashboard + + + +
+
+
User Workflow
+

How to Use

+

+ This is the practical path for users: get evaluator data, generate artifacts, view a run, + compare candidate results, and share the exact dashboard state with teammates. +

+
+
+ + +
+
+
+
+
Workflow 1
+

Download evaluator results into a run folder.

+

+ A run is normally one direct subdirectory under data/. Use one folder per test so + it is easy to select in Overview and safe to delete later. +

+
+
+
Open Workflow or Download: Use Evaluator Workflow for the guided path, or pages/6_Download.py for manual tabs.
+
Select Download Results: Enter Project ID, Job ID, and optional Suite ID.
+
Choose Output Path: Recommended: data/<test_name>.
+
Pick Download Type: Archives for full local analysis, Result JSON only for lightweight summary generation.
+
Run Download: Wait for completion or watch Recent Tasks when queue mode is enabled.
+
+
+
+ Download Results configuration +
Use a dedicated output folder under the data root. That folder becomes the run you select later.
+
+
+

Decision: Archives or Result JSON only?

+
    +
  • Archives (ZIP): best for complete local investigation, eval_result, parquet generation, and visual inspection.
  • +
  • Result JSON only: faster and lighter. Good when you mainly need downloaded result JSON, not full local analysis.
  • +
  • Scenario downloads: use the Download Scenarios tab when TLR Analysis needs scenario data.
  • +
+
+
+
+
+ +
+
+
+
Workflow 2
+

Generate Summary.csv and Score.csv.

+

+ Most analysis pages need generated CSV artifacts. Stay on Download, switch to Eval Results, + and point the root directory to the same folder you used as the download output path. +

+
+
+
+
1
+

Root directory to evaluate

+

Use the same path, for example data/my_test_20250203. This keeps generated artifacts next to the run.

+
+
+
2
+

Search subdirectories

+

Usually enable this. It lets the app find result.txt or score.json in job/suite subfolders.

+
+
+
3
+

Choose generation mode

+

If results already exist, generate only Summary/Score. If not, run full eval_result generation.

+
+
+
+ Score.csv identity: Criteria pages identify rows by scenario. When a + Dataset field is available, scenario and dataset are treated together so repeated + scenario names from different datasets stay separate. +
+
+
+ Eval Results screen +
Eval Results produces the CSVs consumed by Overview, TP Summary, and Criteria pages.
+
+
+ Environment note: when generation uses perception_eval, activate the pilot-auto ROS environment first: +
+
before running generation
+
source path_to_pilot/install/setup.sh
+
+
+
+
+
+ +
+
+
+
Workflow 3
+

Select the run in Overview and explore.

+

+ Overview is the state hub. Many pages use the run selection and compare mode from Overview, + so users should start there before opening detail pages. +

+
+
+
+

Single-run review

+
    +
  1. Open Overview.
  2. +
  3. Select Single Mode.
  4. +
  5. Choose your run as Baseline (A).
  6. +
  7. Apply Perception Label or Product Label filters if needed.
  8. +
  9. Move to TP Summary, Criteria, Detection Stats, Bounding Box Viewer, or Prediction Evaluation.
  10. +
+
+
+ Overview screen +
Overview gives the first read: summary metrics, filters, report export, and links to specialized pages.
+
+
+
+
+ +
+
+
+
Workflow 4
+

Compare baseline A against candidate B.

+

+ Compare mode lets users answer the product question: did this candidate improve, regress, + or change behavior in a specific slice? +

+
+
+
+

How to set up compare mode

+
    +
  1. Open Overview.
  2. +
  3. Switch to Compare Mode.
  4. +
  5. Select Baseline (A), usually the current accepted run.
  6. +
  7. Select Candidate (B), usually the new run.
  8. +
  9. Check the summary metric deltas before going deeper.
  10. +
+
+
+

Where compare mode is most useful

+
    +
  • TP Summary: TP and kinematic metric deltas.
  • +
  • Criteria Score: Practical Pass Rate changes and absolute gate comparison.
  • +
  • Detection Stats: TP/FP distance-bin and status distribution differences.
  • +
  • Bounding Box Viewer: spatial inspection across runs.
  • +
  • Prediction Evaluation: ADE/FDE delta matrices and distance bins.
  • +
+
+
+
+ Sharing: Overview stores mode and run choices in URL query parameters such as + ?mode=compare&run_a=old_run&run_b=new_run. Copy that link to let another user open the same comparison. +
+
+
+ + +
+ + + + diff --git a/evaluation_dashboard_app/docs/guide/guide.js b/evaluation_dashboard_app/docs/guide/guide.js new file mode 100644 index 0000000..196ab83 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/guide.js @@ -0,0 +1,10 @@ +const observer = new IntersectionObserver((entries) => { + entries.forEach((entry) => { + if (entry.isIntersecting) { + entry.target.classList.add("in"); + observer.unobserve(entry.target); + } + }); +}, { threshold: 0.12 }); + +document.querySelectorAll(".reveal").forEach((el) => observer.observe(el)); diff --git a/evaluation_dashboard_app/docs/guide/index.html b/evaluation_dashboard_app/docs/guide/index.html new file mode 100644 index 0000000..154bb61 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/index.html @@ -0,0 +1,142 @@ + + + + + + Evaluation Dashboard Guide + + + +
+
+
+
Dashboard Documentation
+

Evaluation Dashboard
Guide

+

+ This guide is the entry point for using, debugging, extending, and deploying the + evaluation dashboard. The home page gives the system map; each substantial topic lives in + its own focused chapter. +

+ +
+
+

Guide Structure

+
    +
  • Home: system role, ownership map, and chapter routing.
  • +
  • Workflow: Download -> Eval Results -> Overview -> Compare.
  • +
  • Page Guide: page-by-page artifact and state contracts.
  • +
  • Data/Reports: run artifacts, trend data, dashboard PDF, specsheet.
  • +
  • Deployment: local, Docker, production, task queue, multi-user operations.
  • +
  • Diagrams: real sequence/system diagrams for key flows.
  • +
+
+
+
+ + + +
+
+
+
+
System Role
+

The dashboard turns evaluator outputs into explorable review evidence.

+

+ It reads run folders under the configured data root, generates dashboard artifacts when + needed, shares selected run state across Streamlit pages, and provides local or production + workflows for comparison, report generation, T4 visualization, and release trend review. +

+
+
+
Overview.py: Run selection, compare mode, filters, share links, dashboard PDF, and specsheet entry.
+
pages/: Numbered Streamlit pages. Filename order is part of the navigation contract.
+
lib/: Data loading, plotting, reporting, T4 clients, task queue integration, and shared UI utilities.
+
deploy/: Docker Compose, Nginx, Redis/RQ workers, Postgres, and production scripts.
+
+
+
+ +
+ +
+ +
+
+
+
Primary Flow
+

The common operational path is still one clear chain.

+

+ The detailed instructions live in the Workflow chapter, but the mental model is simple: + create or choose a run folder, generate the dashboard artifacts, select run A in Overview, + optionally select candidate B, then use the dedicated pages for deeper analysis. +

+ +
+
+
Download: Project/Job/Suite results into a run folder.
+
Eval Results: Generate Summary, Score, and parquet artifacts.
+
Overview: Select run A and synchronize state.
+
Compare: Add candidate B when needed.
+
Detail Pages: Investigate the specific signal.
+
+
+
+
+ +
+
+ Evaluation Dashboard Guide +

This home page routes to the detailed chapters instead of duplicating them.

+
+
+ + + diff --git a/evaluation_dashboard_app/docs/guide/pages.html b/evaluation_dashboard_app/docs/guide/pages.html new file mode 100644 index 0000000..3f19d17 --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/pages.html @@ -0,0 +1,198 @@ + + + + + + Evaluation Dashboard Page Guide + + + +
+
+
Page-by-Page Guide
+

Pages

+

+ A detailed guide to every dashboard page: what it needs, what it shows, how it behaves in compare mode, + and when users should open it. +

+
+
+ + +
+
+
+
+
State Model
+

Start in Overview, then go deep.

+

+ Overview sets mode, selected runs, and shared filters. Detail pages often read those values from + st.session_state, so opening Overview first prevents confusing “please load data” messages. +

+
+
+
Overview: Select run A, optional run B, labels, and mode.
+
Shared state: The app stores run objects and filters in session state.
+
Detail pages: Pages read the active run and specialize the analysis.
+
URL sharing: Overview can encode mode and run names into query params.
+
Team review: Users open the same linked comparison on the shared server.
+
+
+
+ +
+
+
+
Core Pages
+

The pages most users touch first.

+
+
+
+

Overview

+

Use when: starting any review, choosing runs, comparing A/B, exporting dashboard PDFs, or generating release specsheets.

+
    +
  • Inputs: run folders under data root, Summary.csv, labels, compare mode.
  • +
  • Shows: summary metrics, label/product filters, A/B charts, dashboard report export, specsheet export.
  • +
  • Watch out: if a run has no Summary.csv, high-level summary metrics are limited.
  • +
+
+
+

Download

+

Use when: acquiring evaluator results, scenario data, or generating Summary/Score artifacts.

+
    +
  • Tabs: Download Results, Download Scenarios, View Downloads, Eval Results.
  • +
  • Outputs: downloaded archives, result JSON, scenario data, Summary.csv, Score.csv.
  • +
  • Score.csv: contains scenario identity, optional dataset ID, criteria blocks, and Practical Pass Rate.
  • +
  • Queue behavior: with USE_TASK_QUEUE=true, heavy tasks run in workers and appear in Recent Tasks.
  • +
+
+
+

Evaluator Workflow

+

Use when: you want a more guided operational flow for local runs, background tasks, fresh evaluator pipelines, and report reuse.

+
    +
  • Good for: launching longer evaluator workflows without jumping between many manual steps.
  • +
  • Outputs: downloaded artifacts, optional eval_result, Summary.csv, Score.csv, optional parquet, and report assets.
  • +
  • Depends on: evaluator API configuration, task queue for long-running jobs in production.
  • +
+
+
+

Data Management

+

Use when: managing a shared server or cleaning up old run outputs.

+
    +
  • Shows: run folders, sizes, modified time, Summary/Score/parquet presence.
  • +
  • Actions: create share links, download ZIP outputs, delete run folders under the data root.
  • +
  • Safety: deletion is restricted to run-level directories under the data root.
  • +
+
+
+
+
+ +
+
+
+
Metric Analysis Pages
+

Turn CSV and parquet artifacts into review signals.

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Page | Prerequisite | Main Use | Compare Behavior
TP Summary | Summary.csv | TP rate, RMS/STD, velocity scatter, metric distribution, density, scenario delta ranking. | Shows candidate-vs-baseline deltas such as Delta TP and metric shifts.
Criteria Based Score | Score.csv | Criteria block selection, Practical Pass Rate distribution, group means, box plots, absolute gates. | Compares Practical Pass Rate changes, gate pass/fail status, and per-scenario deltas. Uses Scenario + Dataset when Dataset exists.
Detection Stats | Parquet files | TP/FP/FN rates, distance bins, status distribution, object counts, label and scenario breakdown. | Side-by-side and delta-oriented detection metrics across selected runs.
Prediction Evaluation | Prediction parquet/artifacts | Specsheet-aligned ADE/FDE, label matrices, distance bins, polar/radial breakdowns. | ADE/FDE delta matrix and per-distance comparisons between A and B.
Trend Insights | Trend metadata.yaml + summary.json | Release inventory, mAP trend, prediction trend, pass-rate trend, defect evaluation, metric atlas. | Not A/B in the same way; it groups release history over versions.
+
+
+ +
+
+
+
Spatial and Visual Pages
+

Use these when numbers are not enough.

+
+
+
+

Bounding Box Viewer

+

Bird's-eye-view (BEV) inspection from parquet data. Filter by t4dataset, topic, label, visibility, source, status, frame, and run. Best for understanding where misses and false positives happen spatially.

+
+
+

T4 3D Viewer

+

3D-oriented visual inspection and T4 visualizer integration. Best when BEV alone is not enough and users need camera or rendered context.

+
+
+

T4 Dataset Server

+

Integration helper for liveness checks, render requests, target object JSON, and camera PNG embed workflows. More operational than analysis-focused.

+
+
+
+
+ +
+
+
+
Specialized Pages
+

Tools for narrower investigations and operations.

+
+
+
+

TLR Analysis

+

Traffic Light Recognition evaluation. Use after downloading scenario data from Download Scenarios. It visualizes criteria matrices, vehicle status vs signal type, important zones, and compare-mode deltas.

+
+
+

Parquet Debug

+

Developer troubleshooting page for parquet, pkl, and result JSON. Use it when a page fails to parse data, schemas look suspicious, or criteria state needs low-level inspection.

+
+
+

Help

+

In-app README viewer with Japanese/English switching. Useful when users are inside Streamlit and need setup or workflow reminders without leaving the app.

+
+
+

Deployment Debug

+

Docker-only operations page. Checks environment, Postgres, Redis, RQ, task rows, container status, logs, and optional restricted exec. Keep access controlled.

+
+
+
+
+
+ + + + diff --git a/evaluation_dashboard_app/docs/guide/specsheet.html b/evaluation_dashboard_app/docs/guide/specsheet.html new file mode 100644 index 0000000..3f3fafa --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/specsheet.html @@ -0,0 +1,274 @@ + + + + + + Specsheet Details + + + +
+
+
Specsheet
+

Specsheet Details

+

+ The specsheet export creates a release-oriented PDF from the selected run, optional trend metadata, + and the external perception_catalog_analyzer library. +

+
+
+ + + +
+
+
+
+
Overview
+

What the specsheet export does.

+

+ The dashboard handles the UI, selected run path, metadata, progress, and local artifact setup. + The analyzer library handles the metric blocks, template rendering, plots, and PDF output. +

+
+
+
+ 1. Select a run + Overview.py gathers project, version, topic, labels, and optional trend metadata. +
+
+ 2. Prepare files + ensure_specsheet_csvs() creates current.csv and future.csv when needed. +
+
+ 3. Build blocks + SceneDataFrame.from_dir() and get_blocks() produce abstract and detailed sections. +
+
+ 4. Add trend context + metadata.yaml and summary.json files are classified and converted into trend rows and plots. +
+
+ 5. Render PDF + update_template() creates HTML, then specsheet() writes specsheet.pdf. +
+
+
+
+ +
+
+
+
App and Library
+

The integration boundary is mostly in lib/specsheet_report.py.

+
+ + + + + + + + + + + + + + + + + + + + + + + + +
Layer | Key file/module | Responsibility
Streamlit UI | Overview.py | Collects project identity, version, topic, labels, selected run, trend toggle, and user-facing progress.
Dashboard wrapper | lib/specsheet_report.py | Defines artifact paths, adapts analyzer signatures, discovers trend files, classifies summaries, and prepares plot paths.
Analyzer library | perception_catalog_analyzer | Loads scene data, generates specsheet metric blocks, renders template HTML, creates plots, and writes the final PDF.
Local artifacts | data/<run>/... | Stores run CSV/parquet files, trend metadata, summary files, generated PNGs, HTML, and specsheet.pdf.
+
+
+ +
+
+
+
Files
+

Specsheet input and output files live inside the run folder.

+

+ Trend files can come from a standalone dashboard run or from a grouped release folder. In both cases, + each trend item needs a metadata.yaml file next to its summary.json. +

+
+
+
+

Standalone run shape

+
+
data/my_run/
+
current.csv
+
future.csv
+
resources/
+
metadata.yaml
+
summary.json
+
specsheet/
+
specsheet.html
+
specsheet.pdf
+
+
+
+

Grouped release shape

+
+
data/trend_release_full_usecase_devops/
+
perception.object_recognition.objects/
+
<full_job_id>/metadata.yaml + summary.json
+
<usecase_job_id>/metadata.yaml + summary.json
+
<devops_job_id>/metadata.yaml + summary.json
+
specsheet/
+
map_trend.png
+
devops_trend.png
+
specsheet.pdf
+
+
+
+
+ Note: discover_trend_metadata_files() scans the data root for + metadata/summary pairs, and discover_trend_release_groups() decides how those files + should be grouped for the release PDF. +
+
+
+ +
+
+
+
Trend Data
+

Trend summaries are classified by JSON shape.

+

+ Metadata provides release identity. The summary payload decides whether the item is a full, + usecase, devops, or unknown trend source. +

+
+ + + + + + + + + + + + + + + + + + + +
Role | How it is recognized | Specsheet use
Full performance | summary.json has blocks containing 全数データセット評価. | Feeds mAP, precision, recall, error, and prediction trend sections.
Usecase | summary.json has blocks containing ユースケース評価. | Participates in release grouping and inventory context.
DevOps pass-rate | Summary is a nested dictionary without blocks, with category results containing passed and total. | Feeds overall pass-rate trend and pass-rate detail plots.
+
+
+ +
+
+
+
PDF Assembly
+

The final PDF is assembled from analyzer HTML plus dashboard trend context.

+
+
+
+

get_blocks()

+

+ Creates abstract and detailed metric fragments for labels, metrics, and evaluation type. + These fragments become the main technical body of the PDF. +

+
+
+

update_template()

+

+ Receives project/version metadata and trend context, then renders the analyzer template body. + Generated PNG paths are included when trend plots exist. +

+
+
+

specsheet()

+

+ Combines body HTML, abstract HTML, and detailed HTML, then writes + specsheet/specsheet.html and specsheet/specsheet.pdf. +

+
+
+

Trend plots

+

+ Full performance trends can generate map_trend.png and + prediction_trend.png. DevOps summaries can generate + devops_trend.png and devops_trend_detail.png. +

+
+
+
+
+ +
+
+
+
Debugging
+

When a specsheet section is missing, check the data contract.

+
+ + + + + + + + + + + + + + + + + + + + + + + + +
Symptom | What to check | Expected condition
No trend section | Trend metadata discovery | At least one metadata.yaml has a sibling summary.json.
Full trend is missing | Full summary role | summary.json has full-performance blocks and generated full trend rows.
Pass Rate Trend is missing | DevOps summary and plot files | DevOps rows are non-empty and devops_trend.png exists in the specsheet output folder.
PDF says no data | Template context | The relevant trend list is non-empty before calling update_template().
+
+
quick local verification
+
PYTHONPATH=. python - <<'PY'
+from pathlib import Path
+from lib.specsheet_report import _build_trend_context, discover_trend_metadata_files
+
+ctx = _build_trend_context(discover_trend_metadata_files(), Path("/tmp/specsheet-trend-check"))
+print(len(ctx["performance_trend_data"]), len(ctx["devops_trend_data"]))
+PY
+
+
+
+
+ + + + + diff --git a/evaluation_dashboard_app/docs/guide/styles.css b/evaluation_dashboard_app/docs/guide/styles.css new file mode 100644 index 0000000..3459c9f --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/styles.css @@ -0,0 +1,848 @@ +:root { + --bg: #f6f8fb; + --paper: #ffffff; + --ink: #101827; + --muted: #5d697d; + --line: #dce4f0; + --blue: #2563eb; + --teal: #0f766e; + --cyan: #0891b2; + --gold: #b7791f; + --red: #be123c; + --violet: #6d28d9; + --dark: #111827; + --shadow: 0 20px 60px rgba(17, 24, 39, .12); + --soft-shadow: 0 12px 30px rgba(17, 24, 39, .07); + --radius: 8px; +} + +* { box-sizing: border-box; } +html { scroll-behavior: smooth; } +body { + margin: 0; + color: var(--ink); + background: + radial-gradient(circle at 14% 8%, rgba(37, 99, 235, .12), transparent 26rem), + radial-gradient(circle at 86% 16%, rgba(15, 118, 110, .12), transparent 24rem), + linear-gradient(180deg, #f8fbff 0%, #ffffff 34%, #f6f8fb 100%); + font-family: Inter, ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; + line-height: 1.58; +} + +body::before { + content: ""; + position: fixed; + inset: 0; + pointer-events: none; + z-index: -1; + background-image: + linear-gradient(rgba(17, 24, 39, .042) 1px, transparent 1px), + linear-gradient(90deg, rgba(17, 24, 39, .042) 1px, transparent 1px); + background-size: 44px 44px; + mask-image: linear-gradient(180deg, rgba(0,0,0,.7), transparent 70%); +} + +a { color: inherit; } +code, pre { font-family: "SFMono-Regular", Consolas, "Liberation Mono", monospace; } +[hidden] { display: none !important; } + +.shell { + width: min(1180px, calc(100% - 36px)); + margin: 0 auto; +} + +.hero { + min-height: 72vh; + display: grid; + align-items: center; + padding: 56px 0 36px; +} + +.hero.compact { + min-height: 46vh; +} + +.hero-grid { + display: grid; + grid-template-columns: minmax(0, 1fr) minmax(330px, 440px); + gap: 46px; + align-items: center; +} + +.eyebrow { + display: inline-flex; + 
align-items: center; + gap: 10px; + color: var(--teal); + font-size: .78rem; + font-weight: 900; + letter-spacing: .14em; + text-transform: uppercase; +} + +.signal { + width: 11px; + height: 11px; + border-radius: 99px; + background: var(--teal); + animation: ping 1.9s infinite; +} + +h1 { + margin: 16px 0 18px; + font-size: clamp(3.2rem, 7.6vw, 7.8rem); + line-height: .88; + letter-spacing: 0; + max-width: 980px; +} + +h2 { + margin: 0 0 16px; + font-size: clamp(2rem, 4vw, 4.2rem); + line-height: 1; + letter-spacing: 0; +} + +h3 { + margin: 0 0 10px; + font-size: 1.12rem; + line-height: 1.24; +} + +p { margin: 0; } + +.lead { + max-width: 860px; + color: var(--muted); + font-size: 1.15rem; +} + +.actions { + display: flex; + gap: 12px; + flex-wrap: wrap; + margin-top: 28px; +} + +.button { + display: inline-flex; + align-items: center; + gap: 10px; + min-height: 44px; + padding: 11px 15px; + border-radius: var(--radius); + border: 1px solid var(--line); + background: var(--paper); + color: var(--ink); + text-decoration: none; + font-weight: 820; + box-shadow: 0 8px 18px rgba(17, 24, 39, .07); +} + +.button.primary { + color: white; + background: var(--dark); + border-color: var(--dark); +} + +.button:hover { transform: translateY(-1px); } + +.language-console { + display: inline-flex; + align-items: center; + gap: 8px; + flex-wrap: wrap; + margin-top: 24px; + padding: 8px; + border: 1px solid var(--line); + border-radius: var(--radius); + background: rgba(255,255,255,.78); + box-shadow: var(--soft-shadow); + color: var(--muted); + font-weight: 850; + font-size: .9rem; +} + +.lang-button { + min-height: 34px; + border: 1px solid var(--line); + border-radius: 7px; + padding: 7px 11px; + background: white; + color: #334155; + font: inherit; + font-weight: 900; + cursor: pointer; +} + +.lang-button.active { + color: white; + background: var(--dark); + border-color: var(--dark); +} + +.hero-console { + border-top: 4px solid var(--teal); +} + +.metric-grid { + 
margin-top: 26px; +} + +.metric { + min-height: 140px; + display: flex; + flex-direction: column; + gap: 10px; +} + +.metric strong { + font-size: 1.05rem; +} + +.metric span { + color: var(--muted); +} + +nav { + position: sticky; + top: 0; + z-index: 30; + background: rgba(248, 251, 255, .88); + border-block: 1px solid rgba(220,228,240,.9); + backdrop-filter: blur(14px); +} + +.nav-inner { + display: flex; + gap: 8px; + align-items: center; + padding: 12px 0; + overflow-x: auto; +} + +.nav-inner a { + text-decoration: none; + white-space: nowrap; + color: #334155; + font-size: .88rem; + font-weight: 820; + padding: 8px 10px; + border-radius: 7px; +} + +.nav-inner a:hover, .nav-inner a.active { + background: white; + color: var(--blue); +} + +section { + padding: 78px 0; + position: relative; +} + +.section-head { + max-width: 900px; + margin-bottom: 32px; +} + +.kicker { + color: var(--blue); + font-size: .78rem; + font-weight: 950; + letter-spacing: .14em; + text-transform: uppercase; + margin-bottom: 12px; +} + +.grid { display: grid; gap: 18px; } +.cols-2 { grid-template-columns: repeat(2, minmax(0, 1fr)); } +.cols-3 { grid-template-columns: repeat(3, minmax(0, 1fr)); } +.cols-4 { grid-template-columns: repeat(4, minmax(0, 1fr)); } + +.card { + background: rgba(255,255,255,.92); + border: 1px solid var(--line); + border-radius: var(--radius); + padding: 20px; + box-shadow: var(--soft-shadow); +} + +.card p, .card li { color: var(--muted); } +.card ul { margin: 12px 0 0; padding-left: 18px; } + +.number { + width: 34px; + height: 34px; + display: inline-grid; + place-items: center; + border-radius: 8px; + background: var(--blue); + color: white; + font-weight: 950; + margin-bottom: 13px; +} + +.flow { + display: grid; + grid-template-columns: repeat(5, minmax(132px, 1fr)); + gap: 12px; + align-items: stretch; + margin-top: 24px; +} + +.compact-flow { + grid-template-columns: 1fr; + margin-top: 0; +} + +.compact-flow .step { + min-height: auto; +} + 
+.compact-flow .step::after { + display: none; +} + +.step { + min-height: 155px; + padding: 16px; + border: 1px solid var(--line); + border-radius: var(--radius); + background: white; + position: relative; + overflow: hidden; + animation: lift .6s ease both; +} + +.step:nth-child(2) { animation-delay: .08s; } +.step:nth-child(3) { animation-delay: .16s; } +.step:nth-child(4) { animation-delay: .24s; } +.step:nth-child(5) { animation-delay: .32s; } + +.step::after { + content: ""; + position: absolute; + top: 50%; + right: -23px; + width: 42px; + height: 2px; + background: linear-gradient(90deg, var(--blue), transparent); +} + +.step:last-child::after { display: none; } +.step strong { display: block; margin-bottom: 8px; } +.step span { color: var(--muted); font-size: .9rem; } + +.split { + display: grid; + grid-template-columns: minmax(0, 1.05fr) minmax(0, .95fr); + gap: 20px; + align-items: start; +} + +.media { + overflow: hidden; + border-radius: var(--radius); + border: 1px solid var(--line); + background: white; + box-shadow: var(--soft-shadow); +} + +.media img { + width: 100%; + display: block; + object-fit: cover; +} + +.media-grid { + display: grid; + grid-template-columns: repeat(3, minmax(0, 1fr)); + gap: 18px; + margin-top: 28px; +} + +.caption { + color: var(--muted); + font-size: .86rem; + padding: 12px 14px; + border-top: 1px solid var(--line); + background: #fbfdff; +} + +.code { + background: #101827; + color: #dbeafe; + border-radius: var(--radius); + border: 1px solid rgba(255,255,255,.08); + overflow: hidden; + box-shadow: 0 16px 44px rgba(15, 23, 42, .18); +} + +.code-title { + display: flex; + justify-content: space-between; + gap: 12px; + padding: 12px 15px; + border-bottom: 1px solid rgba(255,255,255,.1); + color: #bfdbfe; + font-weight: 850; + font-size: .88rem; +} + +pre { + margin: 0; + padding: 17px; + overflow: auto; + font-size: .82rem; + line-height: 1.55; +} + +.tree { + background: white; + border: 1px solid var(--line); + 
border-radius: var(--radius); + padding: 18px; + font-family: "SFMono-Regular", Consolas, monospace; + overflow-x: auto; + color: #334155; + box-shadow: var(--soft-shadow); +} + +.tree div { + white-space: nowrap; + opacity: 0; + transform: translateX(-10px); + animation: treeIn .45s ease forwards; +} + +.tree div:nth-child(2) { animation-delay: .04s; } +.tree div:nth-child(3) { animation-delay: .08s; } +.tree div:nth-child(4) { animation-delay: .12s; } +.tree div:nth-child(5) { animation-delay: .16s; } +.tree div:nth-child(6) { animation-delay: .20s; } +.tree div:nth-child(7) { animation-delay: .24s; } +.tree div:nth-child(8) { animation-delay: .28s; } +.tree div:nth-child(9) { animation-delay: .32s; } +.tree div:nth-child(10) { animation-delay: .36s; } + +.pill { + display: inline-flex; + align-items: center; + gap: 8px; + min-height: 28px; + padding: 5px 9px; + border: 1px solid var(--line); + border-radius: 999px; + background: #fbfdff; + color: #334155; + font-size: .8rem; + font-weight: 780; + margin: 3px 4px 3px 0; +} + +.callout { + border-left: 5px solid var(--teal); + background: #ecfdf5; + color: #123f38; + border-radius: var(--radius); + padding: 18px; +} + +.callout.warn { + border-left-color: var(--gold); + background: #fff8e7; + color: #513a13; +} + +.sequence { + display: grid; + grid-template-columns: repeat(var(--cols, 5), minmax(110px, 1fr)); + gap: 10px; + margin: 22px 0; + position: relative; +} + +.actor { + min-height: 78px; + border: 1px solid var(--line); + border-radius: var(--radius); + background: white; + display: grid; + place-items: center; + text-align: center; + padding: 12px; + box-shadow: var(--soft-shadow); + font-weight: 900; +} + +.actor small { + display: block; + color: var(--muted); + font-weight: 750; + margin-top: 4px; +} + +.message { + grid-column: 1 / -1; + display: grid; + grid-template-columns: subgrid; + min-height: 44px; + align-items: center; +} + +.arrow { + height: 28px; + border-top: 2px solid var(--blue); + 
position: relative; + display: flex; + align-items: flex-start; + justify-content: center; + color: var(--muted); + font-size: .78rem; + font-weight: 800; + padding-top: 6px; +} + +.arrow::after { + content: ""; + position: absolute; + right: -2px; + top: -6px; + border-left: 9px solid var(--blue); + border-top: 5px solid transparent; + border-bottom: 5px solid transparent; +} + +.arrow.back { + border-color: var(--teal); +} + +.arrow.back::after { + right: auto; + left: -2px; + border-left: 0; + border-right: 9px solid var(--teal); +} + +.span-1-2 { grid-column: 1 / 3; } +.span-2-3 { grid-column: 2 / 4; } +.span-3-4 { grid-column: 3 / 5; } +.span-4-5 { grid-column: 4 / 6; } +.span-1-3 { grid-column: 1 / 4; } +.span-2-4 { grid-column: 2 / 5; } +.span-3-5 { grid-column: 3 / 6; } +.span-2-5 { grid-column: 2 / 6; } + +.system-map { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 14px; + align-items: stretch; + margin-top: 22px; +} + +.system-node { + background: white; + border: 1px solid var(--line); + border-radius: var(--radius); + padding: 16px; + min-height: 132px; + box-shadow: var(--soft-shadow); + position: relative; + overflow: hidden; +} + +.system-node::before { + content: ""; + position: absolute; + inset: 0 auto 0 0; + width: 5px; + background: var(--blue); +} + +.system-node.teal::before { background: var(--teal); } +.system-node.gold::before { background: var(--gold); } +.system-node.cyan::before { background: var(--cyan); } +.system-node.violet::before { background: var(--violet); } +.system-node.red::before { background: var(--red); } + +.system-node p { + color: var(--muted); +} + +.mini-diagram { + border: 1px solid var(--line); + border-radius: var(--radius); + background: white; + padding: 18px; + box-shadow: var(--soft-shadow); + overflow-x: auto; +} + +.real-diagram { + background: white; + border: 1px solid var(--line); + border-radius: var(--radius); + box-shadow: var(--soft-shadow); + overflow: auto; + margin: 22px 
0; +} + +.real-diagram svg { + display: block; + min-width: 980px; + width: 100%; + height: auto; +} + +.svg-title { + font: 800 18px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #101827; +} + +.svg-actor { + fill: #ffffff; + stroke: #cbd5e1; + stroke-width: 1.2; +} + +.svg-actor-text { + font: 800 13px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #101827; +} + +.svg-small { + font: 700 11px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #64748b; +} + +.svg-line { + stroke: #cbd5e1; + stroke-width: 1.2; + stroke-dasharray: 5 6; +} + +.svg-msg { + stroke: #2563eb; + stroke-width: 2; + fill: none; + marker-end: url(#arrow-blue); +} + +.svg-msg-return { + stroke: #0f766e; + stroke-width: 2; + fill: none; + stroke-dasharray: 7 5; + marker-end: url(#arrow-teal); +} + +.svg-note { + fill: #f8fafc; + stroke: #dbe5f2; +} + +.svg-note-warn { + fill: #fff8e7; + stroke: #f2d38b; +} + +.svg-note-text { + font: 700 12px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #334155; +} + +.svg-step { + font: 800 12px Inter, ui-sans-serif, system-ui, sans-serif; + fill: #1e3a8a; +} + +.payload-grid { + display: grid; + grid-template-columns: repeat(3, minmax(0, 1fr)); + gap: 14px; + margin-top: 18px; +} + +.payload-card { + background: #101827; + color: #dbeafe; + border-radius: var(--radius); + border: 1px solid rgba(255,255,255,.08); + overflow: hidden; +} + +.payload-card h3 { + padding: 12px 14px; + border-bottom: 1px solid rgba(255,255,255,.1); + color: #bfdbfe; + font-size: .92rem; +} + +.payload-card pre { + font-size: .76rem; +} + +.legend-row { + display: flex; + gap: 10px; + flex-wrap: wrap; + margin-top: 12px; +} + +.legend-item { + display: inline-flex; + align-items: center; + gap: 8px; + color: var(--muted); + font-weight: 760; + font-size: .86rem; +} + +.legend-swatch { + width: 22px; + height: 4px; + border-radius: 999px; + background: var(--blue); +} + +.legend-swatch.return { + background: repeating-linear-gradient(90deg, var(--teal) 0 7px, 
transparent 7px 12px); + border: 1px solid rgba(15,118,110,.25); +} + +.swimlanes { + display: grid; + gap: 12px; +} + +.swimlane { + display: grid; + grid-template-columns: 180px minmax(0, 1fr); + gap: 12px; + align-items: stretch; +} + +.swimlane-label { + border-radius: var(--radius); + background: var(--dark); + color: white; + padding: 14px; + display: grid; + align-items: center; + font-weight: 900; +} + +.swimlane-flow { + display: flex; + gap: 10px; + flex-wrap: wrap; + align-items: center; + border: 1px solid var(--line); + background: #fbfdff; + border-radius: var(--radius); + padding: 12px; +} + +.chip { + border: 1px solid var(--line); + border-radius: 8px; + background: white; + padding: 9px 10px; + font-size: .84rem; + font-weight: 820; + color: #334155; +} + +.table, +.table-wrap table { + width: 100%; + border-collapse: collapse; + overflow: hidden; + border-radius: var(--radius); + background: white; + border: 1px solid var(--line); + box-shadow: var(--soft-shadow); +} + +.table-wrap { + overflow-x: auto; + border-radius: var(--radius); +} + +.table th, .table td, +.table-wrap th, .table-wrap td { + text-align: left; + padding: 12px 14px; + border-bottom: 1px solid var(--line); + vertical-align: top; +} + +.table th, +.table-wrap th { + background: #f8fafc; + font-size: .82rem; + text-transform: uppercase; + letter-spacing: .08em; +} + +.table td, +.table-wrap td { color: var(--muted); } + +.feature-card { + border-top: 5px solid var(--blue); +} +.feature-card.analysis { border-top-color: var(--teal); } +.feature-card.spatial { border-top-color: var(--cyan); } +.feature-card.ops { border-top-color: var(--gold); } +.feature-card.advanced { border-top-color: var(--violet); } + +.footer { + padding: 48px 0 70px; + border-top: 1px solid var(--line); + color: var(--muted); +} + +.reveal { + opacity: 0; + transform: translateY(18px); + transition: opacity .6s ease, transform .6s ease; +} + +.reveal.in { + opacity: 1; + transform: translateY(0); +} + 
+@keyframes ping { + 0% { box-shadow: 0 0 0 0 rgba(15, 118, 110, .45); } + 72% { box-shadow: 0 0 0 13px rgba(15, 118, 110, 0); } + 100% { box-shadow: 0 0 0 0 rgba(15, 118, 110, 0); } +} + +@keyframes lift { + from { opacity: 0; transform: translateY(18px); } + to { opacity: 1; transform: translateY(0); } +} + +@keyframes treeIn { + to { opacity: 1; transform: translateX(0); } +} + +@media (max-width: 980px) { + .hero-grid, .split, .cols-2, .cols-3, .cols-4, .media-grid { + grid-template-columns: 1fr; + } + .flow { grid-template-columns: 1fr; } + .step::after { display: none; } +} + +@media (prefers-reduced-motion: reduce) { + *, *::before, *::after { + animation-duration: 0.01ms !important; + animation-iteration-count: 1 !important; + transition-duration: 0.01ms !important; + scroll-behavior: auto !important; + } +} diff --git a/evaluation_dashboard_app/docs/guide/visual_systems.html b/evaluation_dashboard_app/docs/guide/visual_systems.html new file mode 100644 index 0000000..384021c --- /dev/null +++ b/evaluation_dashboard_app/docs/guide/visual_systems.html @@ -0,0 +1,555 @@ + + + + + + Evaluation Dashboard Technical Diagrams + + + +
+
+
Technical Diagrams
+

Real Flows

+

+ Precise diagrams for the parts users and maintainers actually ask about: download/eval execution, + compare-mode state, page artifact dependencies, T4 camera rendering, T4 Three.js 3D overlays, + production queueing, and report generation. +

+
+
+ + +
+
+
+
+
Artifact Dependency Map
+

Which generated files unlock which pages?

+

+ This is the first diagram to check when a user asks why a page is empty. + Most UI behavior follows directly from whether these files exist under the selected run. +

+
+
+ + + + + + + + + + Run folder artifact dependency map + + + data/<run>/ + Summary.csv + Score.csv + *.parquet + result.txt / score.json + resources/metadata.yaml + resources/summary.json + specsheet/*.png + specsheet/specsheet.pdf + + + Overview + run selection + summary + + + TP Summary + TP/RMS/velocity charts + + + Criteria Score + pass rate + gates + + + Detection Stats + DuckDB + parquet scan + + + Bounding Box / T4 3D + BEV + Three.js overlays + + + Prediction Evaluation + ADE/FDE matrices + + + Trend Insights + release metadata + summaries + + + Download / Workflow + creates / refreshes files + + + Reports + dashboard PDF + specsheet + + + + + + + + + + + Blue = page reads artifact. Dashed green = workflow produces or refreshes artifact. + +
+
+
+ +
+
+
+
Real Sequence
+

Download Results -> Eval Results -> Overview selection.

+

+ This sequence shows both inline and production task-queue modes. In production, the UI does not run long jobs directly. +

+
+
+ + + + + + + + + + Sequence: user downloads a job, generates CSV artifacts, then opens Overview + + + + User + + + + + Streamlit + Download page + + + + + Redis / RQ + queue mode + + + + + Worker + heavy tasks + + + + + Evaluator + API / files + + + + + Data + root + + + + + 1. Submit Project ID, Job ID, output path + + + 2a. Queue task if USE_TASK_QUEUE=true + + 2b. Worker consumes RQ job + + + If queue mode is off, Streamlit + runs this work inline. + + + 3. Download archives / result JSON / scenario data + + 4. API response / downloaded files + + + 5. Write result.txt, score.json, extracted archives + + + 6. User runs Eval Results for same root + + 7. Generate Summary.csv + Score.csv + + 8. Overview lists data/<run> and reads generated artifacts + +
+
+
+ +
+
+
+
T4 Camera Rendering
+

Bounding Box Viewer / T4 Dataset Server: HTTP render path.

+

+ This path is for camera PNGs or HTML camera render previews. It is separate from the Three.js 3D overlay path below. +

+
+
+ + + + + + + + + + Sequence: camera preview render through T4 visualizer HTTP API + + + Streamlit + + + T4 Client + requests wrapper + + + T4 Server + FastAPI + + + Dataset + local T4 files + + + Browser + PNG / iframe + + + + 1. User selects server base URL, dataset, scenario, frame + + 2. GET /health, /datasets, /datasets/{id}/scenarios + + 3. Server reads available datasets and scene metadata + + 4. JSON lists: ids, scenarios, frame counts + + + 5. Build RenderRequest from UI and optional GT rows + + 6. POST /render {dataset, scenario, frame, target_objects} + + 7. Load camera/sample data and draw annotations + + 8. RenderResult JSON with images[].png_base64 + + 9. Streamlit decodes/displays PNGs or embeds /render/html iframe + 
+
+
+

RenderRequest body

+
{
+  "t4dataset_id": "...",
+  "scenario_name": "...",
+  "frame_index": 42,
+  "target_objects": [{ "uuid": "...", "x": 1.2 }],
+  "show_annotations": true,
+  "crop_cameras": false
+}
+
+
+

RenderResult response

+
{
+  "sample_token": "...",
+  "timestamp_us": 123,
+  "images": [
+    { "label": "CAM_FRONT", "png_base64": "..." }
+  ],
+  "elapsed_ms": 812.4
+}
+
+
+

Source code touchpoints

+
lib/t4_visualizer_client.py
+lib/t4_dataset_embed.py
+pages/11_T4_Dataset_Server.py
+pages/4_Bounding_Box_Viewer.py
+
+
+
+
+ +
+
+
+
T4 3D Rendering
+

Three.js overlay path: parquet -> all-frame layers -> iframe postMessage.

+

+ This is the precise flow used by pages/5_T4_3D_Viewer.py. The app deliberately uses + the viewer’s own frame slider: Streamlit loads the iframe once, sends all frame overlays, and the + viewer selects overlays internally as the user scrubs time. +

+
+
+ + + + + + + + + + Sequence: T4 3D Viewer iframe + postMessage overlay synchronization + + + User + browser + + + + Streamlit + 5_T4_3D_Viewer + + + + DuckDB + parquet_scan + + + + Layer Builder + t4_three_layers + + + + T4 Server + /viewer/three + + + + Three.js iframe + viewer runtime + + + + 1. Open T4 3D Viewer after Overview selected run(s) + + + 2. DESCRIBE + SELECT parquet_scan(?) with filters + + + 3. DataFrame rows: frame_index, source GT/EST, status, geometry + + + Filters come from shared BEV keys: + suite, scenario, t4dataset, topic, label, visibility, runs. + frame_index is normalized to int. + + + 4. GET /datasets/{t4dataset_id}/availability + + 5. { available: true/false, dataset path metadata } + + + 6. build_three_layer_payload_all_frames(df) + + + For each frame_index: + source == GT -> gt[] boxes + source == EST -> pred[] boxes + TP pair_uuid/uuid -> matched_pairs[] + + + 7. Payload: { type: "bbox_layers_by_frame", frames: { "0": ... } } + + + 8. iframe src = {base}/viewer/three?t4dataset_id=...&scenario_name=...&frame_index=min + + + 9. Viewer loads dataset/scenario and its own time slider + + + 10. JS hex-decodes payload and iframe.contentWindow.postMessage(payload, targetOrigin) + + + 11. User scrubs inside viewer; runtime selects frames[frame_index] without Streamlit rerun + +
+ +
+
+

Layer payload

+
{
+  "type": "bbox_layers_by_frame",
+  "frames": {
+    "42": {
+      "gt": [{ "x": 1.0, "source": "GT" }],
+      "pred": [{ "x": 1.2, "source": "EST" }],
+      "matched_pairs": [
+        { "gt_idx": 0, "pred_idx": 0, "pair_uuid": "..." }
+      ]
+    }
+  }
+}
+
+
+

Iframe URL

+
viewer_three_url =
+  T4_VISUALIZER_BASE_URL
+  + "/viewer/three?"
+  + "t4dataset_id=..."
+  + "&scenario_name=..."
+  + "&frame_index=min_frame"
+
+
+

Post timing

+
post("iframe-load")
+retry every 250ms up to 12 times
+post("initial-delay-300ms")
+post("initial-delay-1200ms")
+
+targetOrigin = new URL(iframe.src).origin
+
+
+ +
+ Key distinction: the 3D viewer does not call POST /render for every frame. + It embeds /viewer/three once and sends all-frame overlay data via postMessage. + Camera PNG rendering is a separate HTTP render path. +
+
+
+ +
+
+
+
Compare Mode State
+

Overview is the state source for comparison pages.

+
+
+ + + + + + + Compare mode data/state propagation + + + Overview + mode = Compare Mode + runA = Baseline + runB / all_runs = Candidates + run_labels = A, B, C... + label filters + query params run_a/run_b + + + Session State + URL Hydration + st.session_state stores run objects + overview_url_hydrate can rebuild + state from query parameters + important with multiple Streamlit + replicas or direct subpage links + + + Pages consume shared state + TP Summary -> ΔTP / metric deltas + Criteria -> pass-rate / gate deltas + Detection Stats -> status + distance diffs + Bounding Box -> side-by-side/overlay BEV + T4 3D -> selected run layers + Prediction -> ADE/FDE delta matrices + + + write + + read + +
+
+
+ +
+
+
+
Report Generation
+

The dashboard PDF and the specsheet PDF are produced by different engines.

+
+
+ + + + + + + + + + Two report paths + + + Overview selection + mode, run(s), filters + Summary.csv charts + + + Dashboard PDF + lib/overview_pdf_report.py + curated dashboard snapshot + current view + selected filters + + + Release Specsheet PDF + lib/specsheet_report.py + perception_catalog_analyzer + blocks + trend plots + template + + + overview_report.pdf + dashboard narrative + + + specsheet.pdf + release specsheet + + + + + + Specsheet is an advanced report path. Most users first use the dashboard pages and dashboard PDF. + For full detail, open the Specsheet guide page. + +
+ +
+
+
+ + + + + diff --git a/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html b/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html new file mode 100644 index 0000000..0fb205f --- /dev/null +++ b/evaluation_dashboard_app/docs/specsheet_pipeline_explainer.html @@ -0,0 +1,13 @@ + + + + + + + Specsheet Details + + + +

Open Specsheet Details

+ + diff --git a/evaluation_dashboard_app/lib/auth.py b/evaluation_dashboard_app/lib/auth.py index b15d5dc..c29c6eb 100644 --- a/evaluation_dashboard_app/lib/auth.py +++ b/evaluation_dashboard_app/lib/auth.py @@ -4,8 +4,10 @@ a header with the user identity. When enabled, users see only their own tasks. """ +import base64 +import json import os -from typing import Optional +from typing import Any, Dict, Optional # Header name set by auth proxy (e.g. X-Forwarded-User, X-Auth-User). Empty = no auth filtering. AUTH_USER_HEADER = os.environ.get("AUTH_USER_HEADER", "").strip() @@ -14,6 +16,108 @@ AUTH_DEFAULT_USER = os.environ.get("AUTH_DEFAULT_USER", "").strip() or None +def _first_nonempty_string(*values: Any) -> str: + """Return the first non-empty string-like value, else empty string.""" + for value in values: + text = str(value or "").strip() + if text: + return text + return "" + + +def _read_streamlit_headers() -> Dict[str, str]: + """Best-effort request headers from Streamlit context.""" + try: + import streamlit as st + + ctx = getattr(st, "context", None) + headers = getattr(ctx, "headers", None) if ctx else None + if callable(headers): + headers = headers() + if isinstance(headers, dict): + normalized: Dict[str, str] = {} + for key, value in headers.items(): + if not isinstance(key, str): + continue + normalized[key] = str(value) + return normalized + except Exception: + pass + return {} + + +def _decode_jwt_payload(token: str) -> Dict[str, Any]: + """Best-effort JWT payload decode without signature verification, for display only.""" + raw = str(token or "").strip() + if not raw: + return {} + parts = raw.split(".") + if len(parts) < 2: + return {} + payload = parts[1] + padding = "=" * (-len(payload) % 4) + try: + decoded = base64.urlsafe_b64decode(payload + padding) + data = json.loads(decoded.decode("utf-8")) + return data if isinstance(data, dict) else {} + except Exception: + return {} + + +def _extract_identity_from_bearer_token(headers: Dict[str, str]) 
-> Dict[str, Any]: + """Extract subject / email / username / display name from common bearer token claims.""" + authz = str(headers.get("Authorization") or headers.get("authorization") or "").strip() + if not authz.lower().startswith("bearer "): + return {} + token = authz.split(" ", 1)[1].strip() + payload = _decode_jwt_payload(token) + if not payload: + return {} + + session = payload.get("session") or {} + identity = session.get("identity") or {} + traits = identity.get("traits") or {} + name = traits.get("name") or {} + oauth_username = _first_nonempty_string( + payload.get("preferred_username"), + payload.get("username"), + payload.get("upn"), + payload.get("unique_name"), + payload.get("cognito:username"), + traits.get("username"), + identity.get("username"), + ) + full_name = " ".join( + part for part in [str(name.get("first") or "").strip(), str(name.get("last") or "").strip()] if part + ).strip() + display_name = _first_nonempty_string( + payload.get("name"), + full_name, + traits.get("display_name"), + identity.get("display_name"), + oauth_username, + traits.get("email"), + ) + email = _first_nonempty_string( + payload.get("email"), + payload.get("upn"), + traits.get("email"), + identity.get("email"), + ) + subject_id = _first_nonempty_string( + payload.get("sub"), + session.get("account", {}).get("subject_id"), + identity.get("id"), + ) + return { + "subject_id": subject_id, + "email": email, + "username": oauth_username, + "name": display_name, + "claims": payload, + } + + def get_current_user_id() -> Optional[str]: """ Return the current user identifier, or None if auth is not configured. 
@@ -24,19 +128,10 @@ def get_current_user_id() -> Optional[str]: """ if not AUTH_USER_HEADER and not AUTH_DEFAULT_USER: return None - # Try to read header (Streamlit 1.37+) - try: - import streamlit as st - ctx = getattr(st, "context", None) - headers = getattr(ctx, "headers", None) if ctx else None - if callable(headers): - headers = headers() - if isinstance(headers, dict): - value = headers.get(AUTH_USER_HEADER) or headers.get(AUTH_USER_HEADER.lower()) - if value and isinstance(value, str) and value.strip(): - return value.strip() - except Exception: - pass + headers = _read_streamlit_headers() + value = headers.get(AUTH_USER_HEADER) or headers.get(AUTH_USER_HEADER.lower()) + if value and isinstance(value, str) and value.strip(): + return value.strip() return AUTH_DEFAULT_USER diff --git a/evaluation_dashboard_app/lib/criteria_absolute_gates.py b/evaluation_dashboard_app/lib/criteria_absolute_gates.py index c23eaa9..2a10770 100644 --- a/evaluation_dashboard_app/lib/criteria_absolute_gates.py +++ b/evaluation_dashboard_app/lib/criteria_absolute_gates.py @@ -11,6 +11,8 @@ import pandas as pd +from lib.score_schema import score_base_cols, score_identity_cols + MetricOp = Literal["<=", ">="] MAX_CRITERIA_DEFAULT = 32 @@ -22,11 +24,11 @@ def infer_criteria_count( max_criteria: int = MAX_CRITERIA_DEFAULT, ) -> int: """ - Number of criteria blocks in a raw Score dataframe (first 3 cols are base). + Number of criteria blocks in a raw Score dataframe. 
""" if df_raw is None or df_raw.shape[1] < 3: return 1 - n = (df_raw.shape[1] - 3) // block_size + n = (df_raw.shape[1] - len(score_base_cols(df_raw))) // block_size n = max(1, n) return int(min(n, max_criteria)) @@ -65,7 +67,7 @@ def evaluate_scenario_gates( raise ValueError(f"Metric column {metric_gate.column!r} not in df_view") empty_cols = [ - "Scenario", + *score_identity_cols(df_view), "agg_pass_rate", "metric_agg", "scenario_pass", @@ -82,7 +84,12 @@ def evaluate_scenario_gates( d[metric_gate.column] = pd.to_numeric(d[metric_gate.column], errors="coerce") rows: list[dict[str, Any]] = [] - for scen, grp in d.groupby("Scenario", observed=True): + identity_cols = score_identity_cols(d) + for key, grp in d.groupby(identity_cols, observed=True): + if len(identity_cols) == 1: + identity_values = {"Scenario": key[0] if isinstance(key, tuple) else key} + else: + identity_values = dict(zip(identity_cols, key)) rc = len(grp) pr = grp["pass_rate"] mean_pr = float(pr.mean()) @@ -113,7 +120,7 @@ def evaluate_scenario_gates( rows.append( { - "Scenario": scen, + **identity_values, "row_count": rc, "agg_pass_rate": mean_pr, "metric_agg": m_agg, diff --git a/evaluation_dashboard_app/lib/db.py b/evaluation_dashboard_app/lib/db.py index 7110d13..a678f08 100644 --- a/evaluation_dashboard_app/lib/db.py +++ b/evaluation_dashboard_app/lib/db.py @@ -29,6 +29,9 @@ def _task_log_timestamp_prefix() -> str: "run_eval_dirs", "generate_summary_csv", "build_parquet", + "download_and_eval", + "run_evaluator_and_process", + "run_release_specsheet_workflow", ) TASK_STATUSES = ("pending", "running", "completed", "failed") @@ -404,7 +407,7 @@ def update_task_result_summary(task_id: str, summary: Dict[str, Any]) -> bool: def get_task(task_id: str) -> Optional[Dict[str, Any]]: - """Return task row as dict (includes ``rq_job_id`` for RQ cancel / reconcile).""" + """Return task row as dict (includes ``rq_job_id`` and ``session_id`` when available).""" url = get_database_url() if not url: return 
None @@ -420,7 +423,7 @@ def get_task(task_id: str) -> Optional[Dict[str, Any]]: cur.execute( """SELECT id, type, status, parameters, result_path, error_message, progress_message, progress_pct, log_output, result_summary, rq_job_id, - created_at, updated_at + session_id, created_at, updated_at FROM tasks WHERE id = %s""", (task_id,), ) @@ -436,14 +439,17 @@ def get_task(task_id: str) -> Optional[Dict[str, Any]]: def list_recent_tasks( limit: int = 50, + offset: int = 0, session_id: Optional[str] = None, since_days: Optional[int] = None, + include_details: bool = False, ) -> List[Dict[str, Any]]: """Return recent tasks (newest first). If ``session_id`` is set, only that user's tasks. If ``since_days`` is set, only tasks with ``created_at`` within that many calendar days (from DB ``NOW()``). ``limit`` still caps row count. + ``include_details`` includes heavy log/result payloads; task list cards do not need them. """ url = get_database_url() if not url: @@ -457,7 +463,12 @@ def list_recent_tasks( conn = psycopg2.connect(url) try: with conn.cursor(cursor_factory=RealDictCursor) as cur: - cols = "id, type, status, parameters, result_path, error_message, progress_message, progress_pct, log_output, result_summary, rq_job_id, created_at, updated_at" + cols = ( + "id, type, status, parameters, result_path, error_message, " + "progress_message, progress_pct, rq_job_id, created_at, updated_at" + ) + if include_details: + cols += ", log_output, result_summary" conditions: List[str] = [] params: List[Any] = [] if session_id is not None: @@ -469,13 +480,13 @@ def list_recent_tasks( ) params.append(int(since_days)) where = (" WHERE " + " AND ".join(conditions)) if conditions else "" - params.append(limit) + params.extend([max(0, int(limit)), max(0, int(offset))]) cur.execute( f""" SELECT {cols} FROM tasks{where} ORDER BY created_at DESC - LIMIT %s + LIMIT %s OFFSET %s """, params, ) @@ -494,6 +505,45 @@ def list_recent_tasks( return rows +def count_recent_tasks( + session_id: 
Optional[str] = None, + since_days: Optional[int] = None, +) -> int: + """Return total task count for the same filter shape as ``list_recent_tasks``.""" + url = get_database_url() + if not url: + return 0 + try: + import psycopg2 + except ImportError: + return 0 + try: + conn = psycopg2.connect(url) + try: + with conn.cursor() as cur: + conditions: List[str] = [] + params: List[Any] = [] + if session_id is not None: + conditions.append("session_id = %s") + params.append(session_id) + if since_days is not None: + conditions.append( + "created_at >= NOW() - (%s::integer * INTERVAL '1 day')" + ) + params.append(int(since_days)) + where = (" WHERE " + " AND ".join(conditions)) if conditions else "" + cur.execute( + f"SELECT COUNT(*) FROM tasks{where}", + params, + ) + row = cur.fetchone() + return int(row[0]) if row and row[0] is not None else 0 + finally: + conn.close() + except Exception: + return 0 + + def delete_task(task_id: str, session_id: Optional[str] = None) -> bool: """Delete a task row. 
For pending/running, cancels the RQ job first when ``rq_job_id`` is set.""" url = get_database_url() diff --git a/evaluation_dashboard_app/lib/deploy_debug.py b/evaluation_dashboard_app/lib/deploy_debug.py index 0edeb76..d45f024 100644 --- a/evaluation_dashboard_app/lib/deploy_debug.py +++ b/evaluation_dashboard_app/lib/deploy_debug.py @@ -157,6 +157,109 @@ def task_counts_by_status() -> Tuple[bool, str, Optional[Dict[str, int]]]: return False, str(e), None +def database_table_overview() -> Tuple[bool, str, Optional[List[Dict[str, Any]]]]: + """Return public table names with approximate row counts for DB debugging.""" + if not get_database_url(): + return False, "DATABASE_URL is not set", None + with get_connection() as conn: + if conn is None: + return False, "No database connection", None + try: + with conn.cursor() as cur: + cur.execute( + """ + SELECT + t.table_name, + COALESCE(c.reltuples::bigint, 0) AS estimated_rows, + CASE WHEN c.oid IS NULL THEN 0 ELSE pg_total_relation_size(c.oid) END AS total_bytes + FROM information_schema.tables t + LEFT JOIN pg_namespace n ON n.nspname = t.table_schema + LEFT JOIN pg_class c ON c.relname = t.table_name AND c.relnamespace = n.oid + WHERE t.table_schema = 'public' + AND t.table_type = 'BASE TABLE' + ORDER BY t.table_name + """ + ) + rows = [ + { + "table_name": str(r[0]), + "estimated_rows": int(r[1] or 0), + "total_bytes": int(r[2] or 0), + } + for r in cur.fetchall() + ] + return True, "OK", rows + except Exception as e: + return False, str(e), None + + +def database_recent_task_rows( + *, + limit: int = 50, + offset: int = 0, + status: Optional[str] = None, + task_type: Optional[str] = None, + search: Optional[str] = None, +) -> Tuple[bool, str, List[Dict[str, Any]], int]: + """Read recent rows from the task table for the deployment debug DB tab.""" + if not get_database_url(): + return False, "DATABASE_URL is not set", [], 0 + with get_connection() as conn: + if conn is None: + return False, "No database 
connection", [], 0 + try: + from psycopg2.extras import RealDictCursor + except ImportError: + return False, "psycopg2 not installed", [], 0 + + where_parts: List[str] = [] + params: List[Any] = [] + if status: + where_parts.append("status = %s") + params.append(status) + if task_type: + where_parts.append("type = %s") + params.append(task_type) + if search: + needle = f"%{search.strip()}%" + where_parts.append( + """ + ( + id::text ILIKE %s OR type ILIKE %s OR status ILIKE %s OR + COALESCE(session_id, '') ILIKE %s OR COALESCE(rq_job_id, '') ILIKE %s OR + COALESCE(result_path, '') ILIKE %s OR COALESCE(error_message, '') ILIKE %s OR + COALESCE(parameters::text, '') ILIKE %s OR COALESCE(result_summary, '') ILIKE %s + ) + """ + ) + params.extend([needle] * 9) + + where_sql = (" WHERE " + " AND ".join(where_parts)) if where_parts else "" + capped_limit = max(1, min(int(limit), 500)) + safe_offset = max(0, int(offset)) + try: + with conn.cursor(cursor_factory=RealDictCursor) as cur: + cur.execute(f"SELECT COUNT(*) FROM tasks{where_sql}", params) + total_row = cur.fetchone() + total = int(total_row["count"] if total_row else 0) + cur.execute( + f""" + SELECT + id, type, status, session_id, rq_job_id, + created_at, updated_at, progress_pct, progress_message, + result_path, error_message, parameters, result_summary, log_output + FROM tasks{where_sql} + ORDER BY created_at DESC + LIMIT %s OFFSET %s + """, + [*params, capped_limit, safe_offset], + ) + rows = [dict(row) for row in cur.fetchall()] + return True, "OK", rows, total + except Exception as e: + return False, str(e), [], 0 + + def docker_unix_socket_for_check() -> Optional[str]: """Path to Unix socket for existence check, or None if DOCKER_HOST is non-Unix (e.g. 
tcp).""" host = os.environ.get("DOCKER_HOST", "").strip() @@ -223,12 +326,25 @@ def list_containers_for_debug(client) -> Tuple[List[Dict[str, str]], Optional[st rows: List[Dict[str, str]] = [] for c in containers: cid = c.id or "" + attrs = getattr(c, "attrs", None) or {} + state = attrs.get("State") or {} + state_status = (state.get("Status") or getattr(c, "status", "") or "").strip() + health_obj = state.get("Health") or {} + health_s = (health_obj.get("Status") or "").strip() + labels = (attrs.get("Config") or {}).get("Labels") or {} + if not isinstance(labels, dict): + labels = {} + compose_service = (labels.get("com.docker.compose.service") or "").strip() + compose_project = (labels.get("com.docker.compose.project") or "").strip() rows.append( { "id": cid[:12] if len(cid) >= 12 else cid, "full_id": cid, "name": (c.name or "").lstrip("/"), - "status": getattr(c, "status", "") or "", + "state": state_status or "unknown", + "health": health_s if health_s else "—", + "compose_service": compose_service or "—", + "compose_project": compose_project or "—", "image": c.image.tags[0] if c.image and c.image.tags else (c.image.id[:12] if c.image else ""), } ) diff --git a/evaluation_dashboard_app/lib/detection_stats_debug.py b/evaluation_dashboard_app/lib/detection_stats_debug.py new file mode 100644 index 0000000..23c1df4 --- /dev/null +++ b/evaluation_dashboard_app/lib/detection_stats_debug.py @@ -0,0 +1,160 @@ +""" +Optional verbose logging for pages/3_Detection_Stats.py (502 / freeze / OOM debugging). + +Enable with environment variable: + EVAL_DETECTION_STATS_DEBUG=1 + +Logs go to stderr (visible in `docker compose logs streamlit1`). 
+""" + +from __future__ import annotations + +import logging +import os +import resource +import sys +import time +import traceback +from contextlib import contextmanager +from typing import Any, List, Tuple + +_LOG = logging.getLogger("eval_dashboard.detection_stats") +_CONFIGURED = False + + +def detection_stats_debug_enabled() -> bool: + v = os.environ.get("EVAL_DETECTION_STATS_DEBUG", "").strip().lower() + return v in ("1", "true", "yes", "on") + + +def _ensure_logging() -> None: + global _CONFIGURED + if not detection_stats_debug_enabled(): + return + if _CONFIGURED: + return + _LOG.setLevel(logging.DEBUG) + h = logging.StreamHandler(sys.stderr) + h.setFormatter( + logging.Formatter("%(asctime)s [%(levelname)s] detection_stats: %(message)s") + ) + _LOG.addHandler(h) + _LOG.propagate = False + _CONFIGURED = True + + +def ds_dlog(fmt: str, *args: Any) -> None: + """Log one line when debug is enabled.""" + if not detection_stats_debug_enabled(): + return + _ensure_logging() + try: + _LOG.info(fmt, *args) + except Exception: + _LOG.info("%s %s", fmt, args) + + +def ds_debug_init_session_state(session_state: Any) -> None: + """Call once per script run (after set_page_config). 
Resets timing buffer.""" + if not detection_stats_debug_enabled(): + return + session_state["_ds_debug_timings"] = [] + session_state["_ds_debug_run_started"] = time.perf_counter() + ds_dlog("=== Detection Stats script run started ===") + ds_dlog("pid=%s argv[0]=%s", os.getpid(), sys.argv[0] if sys.argv else "") + for key in ( + "EVAL_DETECTION_STATS_DEBUG", + "STREAMLIT_SERVER_COOKIE_SECRET", + "EVAL_DASHBOARD_DATA_ROOT", + ): + v = os.environ.get(key) + if key == "STREAMLIT_SERVER_COOKIE_SECRET" and v: + ds_dlog("env %s=(set len=%s)", key, len(v)) + else: + ds_dlog("env %s=%r", key, v) + + +def ds_debug_log_memory(note: str = "") -> None: + if not detection_stats_debug_enabled(): + return + try: + ru = resource.getrusage(resource.RUSAGE_SELF) + # Linux: ru_maxrss kilobytes; macOS: bytes (best-effort label) + ds_dlog( + "MEM %s ru_maxrss=%s ru_utime=%.3fs ru_stime=%.3fs", + note, + ru.ru_maxrss, + ru.ru_utime, + ru.ru_stime, + ) + except Exception as e: + ds_dlog("MEM %s (unavailable: %s)", note, e) + + +def _append_timing(session_state: Any, name: str, seconds: float) -> None: + if not detection_stats_debug_enabled(): + return + lst = session_state.get("_ds_debug_timings") + if not isinstance(lst, list): + lst = [] + session_state["_ds_debug_timings"] = lst + lst.append((name, seconds)) + + +@contextmanager +def ds_dtimer(name: str, session_state: Any): + """Time a block; record to session_state for the debug expander.""" + if not detection_stats_debug_enabled(): + yield + return + t0 = time.perf_counter() + ds_dlog("TIMER start %s", name) + try: + yield + finally: + dt = time.perf_counter() - t0 + ds_dlog("TIMER end %s (%.3fs)", name, dt) + _append_timing(session_state, name, dt) + + +def ds_debug_log_exception(where: str, exc: BaseException) -> None: + if not detection_stats_debug_enabled(): + return + _ensure_logging() + _LOG.exception("EXCEPTION in %s: %s", where, exc) + + +def ds_debug_render_expander(session_state: Any) -> None: + """Renders a Streamlit 
expander with timings + env (only if debug on).""" + import streamlit as st + + if not detection_stats_debug_enabled(): + return + t_run = session_state.get("_ds_debug_run_started") + total_s = None + if isinstance(t_run, (int, float)): + total_s = time.perf_counter() - float(t_run) + + timings: List[Tuple[str, float]] = session_state.get("_ds_debug_timings") or [] + lines = [ + f"Total wall time (approx): {total_s:.3f}s" if total_s is not None else "Total wall time: n/a", + "", + "Section timings (seconds):", + ] + for name, sec in timings: + lines.append(f" - {name}: {sec:.3f}s") + if not timings: + lines.append(" (no ds_dtimer sections recorded)") + + lines.extend( + [ + "", + "Environment (subset):", + f" EVAL_DETECTION_STATS_DEBUG={os.environ.get('EVAL_DETECTION_STATS_DEBUG', '')!r}", + f" EVAL_DASHBOARD_DATA_ROOT={os.environ.get('EVAL_DASHBOARD_DATA_ROOT', '')!r}", + ] + ) + + with st.expander("Detection Stats debug (EVAL_DETECTION_STATS_DEBUG=1)", expanded=False): + st.code("\n".join(lines), language="text") + st.caption("Check `docker compose logs streamlit1` for the same lines on stderr.") diff --git a/evaluation_dashboard_app/lib/docker_live_structure.py b/evaluation_dashboard_app/lib/docker_live_structure.py new file mode 100644 index 0000000..9274c87 --- /dev/null +++ b/evaluation_dashboard_app/lib/docker_live_structure.py @@ -0,0 +1,243 @@ +""" +Mermaid source for the Deployment debug Docker tab: same subgraph layout as Readme.md (Help). + +Clients → Edge → App Tier → T4 dataset server (optional) → Infrastructure → Workers → Host data, +with live container labels. T4 may be a Compose service (e.g. ``t4_server``) or an external HTTP +endpoint from ``T4_VISUALIZER_BASE_URL`` (synthetic node). 
+""" + +from __future__ import annotations + +import os +from collections import defaultdict +from typing import Dict, List, Optional +from urllib.parse import urlparse + + +def _by_compose_service(rows: List[Dict[str, str]]) -> Dict[str, List[int]]: + by: Dict[str, List[int]] = defaultdict(list) + for i, r in enumerate(rows): + svc = (r.get("compose_service") or "").strip() + if svc and svc != "—": + by[svc].append(i) + return by + + +def _mermaid_plain(s: str, max_len: int) -> str: + return (s or "")[:max_len].replace('"', "'").replace("\n", " ").replace("#", " ") + + +def _row_mermaid_label(r: Dict[str, str]) -> str: + name = _mermaid_plain(r.get("name"), 38) + stt = _mermaid_plain(r.get("state"), 14) + svc = _mermaid_plain(r.get("compose_service"), 18) or "—" + hl = (r.get("health") or "").strip() + if hl and hl != "—": + return f"{name}
{stt} · {svc}
{_mermaid_plain(hl, 14)}" + return f"{name}
{stt} · {svc}" + + +def _row_class(r: Dict[str, str]) -> str: + s = (r.get("state") or "").lower() + if s == "running": + return "run" + if s in ("exited", "dead"): + return "x" + return "o" + + +def _nid(i: int) -> str: + return f"N{i}" + + +def _nid_list(idxs: List[int]) -> Optional[str]: + if not idxs: + return None + return " & ".join(_nid(i) for i in idxs) + + +def _is_t4_compose_service(svc: str) -> bool: + s = (svc or "").strip().lower() + if not s or s == "—": + return False + if s in ("t4_visualizer", "t4_server", "t4_visualizer_server", "t4"): + return True + return s.startswith("t4_") + + +def rowset_has_t4_compose_service(rows: List[Dict[str, str]]) -> bool: + """True if any listed container is classified as the T4 dataset server (Compose service name).""" + return any(_is_t4_compose_service(str(r.get("compose_service") or "")) for r in rows) + + +def _t4_url_display(url: str, *, max_len: int = 52) -> str: + """Short label for Mermaid (host:port or truncated URL).""" + u = (url or "").strip() + if not u: + return "(not set)" + try: + p = urlparse(u) + if p.netloc: + out = p.netloc + else: + out = u + except Exception: + out = u + out = out.replace('"', "'") + return out if len(out) <= max_len else out[: max_len - 1] + "…" + + +T4_SYNTHETIC_NODE = "T4SYN" + + +def live_containers_mermaid( + rows: List[Dict[str, str]], + *, + t4_visualizer_base_url: Optional[str] = None, +) -> str: + """ + flowchart LR with subgraphs matching Help / Readme.md: + Clients, Edge, App Tier, optional T4 dataset server, Infrastructure, Workers, Host data — + plus live labels per container. External T4 HTTP API appears as a synthetic node when + ``T4_VISUALIZER_BASE_URL`` is set and no matching Compose service is listed. 
+ """ + if t4_visualizer_base_url is None: + t4_visualizer_base_url = os.environ.get("T4_VISUALIZER_BASE_URL", "").strip() or None + + if not rows: + return 'flowchart LR\n _empty["No containers in filter"]' + + by = _by_compose_service(rows) + nginx = sorted(by.get("nginx", []), key=lambda i: rows[i].get("name", "")) + st: List[int] = [] + for svc in sorted(s for s in by if s.startswith("streamlit")): + st.extend(sorted(by[svc], key=lambda i: rows[i].get("name", ""))) + redis = sorted(by.get("redis", []), key=lambda i: rows[i].get("name", "")) + pg = sorted(by.get("postgres", []), key=lambda i: rows[i].get("name", "")) + init = sorted(by.get("init_db", []), key=lambda i: rows[i].get("name", "")) + workers = sorted(by.get("worker", []), key=lambda i: rows[i].get("name", "")) + t4: List[int] = [] + for svc, idxs in by.items(): + if _is_t4_compose_service(svc): + t4.extend(sorted(idxs, key=lambda i: rows[i].get("name", ""))) + t4 = sorted(set(t4), key=lambda i: rows[i].get("name", "")) + use_synthetic_t4 = bool(t4_visualizer_base_url) and not t4 + known = set(nginx + st + redis + pg + init + workers + t4) + other = [i for i in range(len(rows)) if i not in known] + + def node_line(i: int) -> str: + r = rows[i] + return f' {_nid(i)}["{_row_mermaid_label(r)}"]:::{_row_class(r)}' + + lines: List[str] = [ + "flowchart LR", + " classDef run fill:#c8e6c9,stroke:#2e7d32", + " classDef x fill:#ffcdd2,stroke:#c62828", + " classDef o fill:#e0e0e0,stroke:#616161", + " classDef syn fill:#e3f2fd,stroke:#1565c0", + ' subgraph clients ["Clients"]', + " BR[Browser]:::syn", + " end", + ] + + if nginx: + lines.append(' subgraph edge ["Edge"]') + for i in nginx: + lines.append(node_line(i)) + lines.append(" end") + + if st: + lines.append(' subgraph app ["App Tier"]') + for i in st: + lines.append(node_line(i)) + lines.append(" end") + + if t4 or use_synthetic_t4: + lines.append(' subgraph t4tier ["T4 dataset server"]') + if t4: + for i in t4: + lines.append(node_line(i)) + else: + 
t4_lab = _mermaid_plain( + f"T4 visualizer (HTTP)
{_t4_url_display(t4_visualizer_base_url or '')}", + 120, + ) + lines.append(f' {T4_SYNTHETIC_NODE}["{t4_lab}"]:::syn') + lines.append(" end") + + infra = redis + pg + init + if infra: + lines.append(' subgraph infra ["Infrastructure"]') + for i in infra: + lines.append(node_line(i)) + lines.append(" end") + + if workers: + lines.append(' subgraph workers ["Workers"]') + for i in workers: + lines.append(node_line(i)) + lines.append(" end") + + lines.append(' subgraph volumes ["Host data"]') + lines.append(' DR[Data root
bind-mounted data]:::syn') + lines.append(" end") + + if other: + lines.append(' subgraph misc ["Other"]') + for i in other: + lines.append(node_line(i)) + lines.append(" end") + + lines.append("") + lines.append(" %% Same topology as Readme.md Help") + + nl_nginx = _nid_list(nginx) + nl_st = _nid_list(st) + nl_redis = _nid_list(redis) + nl_pg = _nid_list(pg) + nl_workers = _nid_list(workers) + nl_t4: Optional[str] + if t4: + nl_t4 = _nid_list(t4) + elif use_synthetic_t4: + nl_t4 = T4_SYNTHETIC_NODE + else: + nl_t4 = None + + if nl_nginx: + lines.append(f" BR --> {nl_nginx}") + if nl_st: + for i in nginx: + lines.append(f" {_nid(i)} --> {nl_st}") + elif nl_st: + lines.append(f" BR --> {nl_st}") + + for i in st: + if nl_redis: + lines.append(f" {_nid(i)} --> {nl_redis}") + if nl_pg: + lines.append(f" {_nid(i)} --> {nl_pg}") + if nl_t4: + lines.append(f" {_nid(i)} --> {nl_t4}") + + for i in redis: + if nl_workers: + lines.append(f" {_nid(i)} --> {nl_workers}") + + for i in workers: + if nl_pg: + lines.append(f" {_nid(i)} --> {nl_pg}") + lines.append(f" {_nid(i)} --> DR") + + if nl_t4: + if t4: + for i in t4: + lines.append(f" {_nid(i)} --> DR") + else: + lines.append(f" {T4_SYNTHETIC_NODE} --> DR") + + for i in init: + for j in pg: + lines.append(f" {_nid(i)} -.-> {_nid(j)}") + + return "\n".join(lines) diff --git a/evaluation_dashboard_app/lib/download_core.py b/evaluation_dashboard_app/lib/download_core.py index 8b2dac9..4c59985 100644 --- a/evaluation_dashboard_app/lib/download_core.py +++ b/evaluation_dashboard_app/lib/download_core.py @@ -9,6 +9,7 @@ import os import shutil import urllib.parse +from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from collections import Counter from typing import Any, Callable, Dict, List, Optional @@ -24,6 +25,19 @@ API_BASE_URL = "https://evaluation.ci.web.auto/v3" +def _compact_eval_path(path: Any, *, parts: int = 2) -> str: + """Return a readable tail path for progress/log messages.""" + 
def run_download_and_eval(
    project_id: str,
    job_id: str,
    suite_id: Optional[str],
    output_path: str,
    download_type: str = "archives",
    phase: str = "perception.object_recognition.tracking.objects",
    *,
    skip_large_file: bool = False,
    large_file_mb: float = 50.0,
    keep_zip_files: bool = False,
    suite_ids: Optional[List[str]] = None,
    run_eval: bool = True,
    generate_parquet: bool = True,
    eval_recursive: bool = True,
    eval_overwrite: bool = False,
    eval_workers: int = 4,
    on_progress: Optional[Callable[[str], None]] = None,
    on_warning: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """
    Combined workflow: download results, then optionally run eval and generate parquet.

    The phases run in order. The function returns right after the download phase
    when no scenario was downloaded or the download raised; eval and parquet are
    only attempted after at least one successful download.

    Args:
        project_id, job_id, suite_id, output_path, download_type, phase,
        skip_large_file, large_file_mb, keep_zip_files, suite_ids:
            forwarded unchanged to run_download_results().
        run_eval: run eval_summary on every result directory under output_path.
        generate_parquet: call pkl_archive_to_parquet on output_path afterwards.
        eval_recursive: passed to eval_summary.find_eval_result_dirs().
        eval_overwrite: passed to eval_summary.run_eval_result_for_dir().
        eval_workers: requested thread count; clamped to 1..number of directories.
        on_progress: optional callback receiving progress messages.
        on_warning: optional callback receiving non-fatal warnings.

    Returns dict with:
        - download_success: bool
        - download_summary: dict with total/success/failed counts and rows
        - eval_summary: dict with directories_processed, etc. (if run_eval=True)
        - parquet_path: str (if generate_parquet=True)
        - errors: list of human-readable error strings, one per failed phase
    """
    from lib import eval_summary

    # Parquet generation is optional: the module may be missing in slim installs.
    pkl_archive_to_parquet = None
    try:
        from lib.perception_catalog_io import pkl_archive_to_parquet as _p2p
        pkl_archive_to_parquet = _p2p
    except ImportError:
        pass

    result: Dict[str, Any] = {
        "download_success": False,
        "download_summary": {},
        "eval_summary": {},
        "parquet_path": "",
        "errors": [],
    }

    # Step 1: Download
    if on_progress:
        on_progress("Starting download phase...")

    try:
        failure_count, total_attempted, rows = run_download_results(
            project_id=project_id,
            job_id=job_id,
            suite_id=suite_id,
            output_path=output_path,
            download_type=download_type,
            phase=phase,
            skip_large_file=skip_large_file,
            large_file_mb=large_file_mb,
            keep_zip_files=keep_zip_files,
            suite_ids=suite_ids,
            on_progress=on_progress,
            on_warning=on_warning,
        )
        success_count = total_attempted - failure_count
        result["download_summary"] = {
            "total": total_attempted,
            "success": success_count,
            "failed": failure_count,
            "rows": rows,
        }

        # Check if download was successful (at least some files downloaded)
        result["download_success"] = success_count > 0
        if failure_count > 0 and success_count == 0:
            result["errors"].append(f"Download failed: {failure_count} of {total_attempted} scenarios failed")
            return result
        if success_count == 0:
            result["errors"].append("Download: No scenarios were successfully downloaded")
            return result

    except Exception as e:
        result["errors"].append(f"Download exception: {e}")
        return result

    # Step 2: Run eval (if requested and download succeeded)
    if run_eval and result["download_success"]:
        if on_progress:
            on_progress("Download complete. Starting eval phase...")

        try:
            eval_root = output_path
            target_dirs = eval_summary.find_eval_result_dirs(eval_root, recursive=eval_recursive)
            if target_dirs:
                total = len(target_dirs)
                eval_statuses: List[Dict[str, Any]] = []
                try:
                    requested_workers = int(eval_workers or 1)
                except (TypeError, ValueError):
                    requested_workers = 1
                # Never start more threads than there are directories.
                workers = max(1, min(requested_workers, total))
                if on_progress:
                    on_progress(f"Eval: completed 0/{total} dirs")

                def _record_status(status: Dict[str, Any], done: int, fallback_path: str) -> None:
                    """Store one status dict and report it through the callbacks."""
                    eval_statuses.append(status)
                    state = str(status.get("status") or "failed")
                    short_path = _compact_eval_path(status.get("path") or fallback_path)
                    if on_progress:
                        on_progress(f"Eval: completed {done}/{total} dirs - {state}: {short_path}")
                    if state == "failed" and on_warning:
                        on_warning(f"Eval failed for {status.get('path', '')}: {status.get('detail', '')}")

                if workers == 1:
                    for i, result_dir in enumerate(target_dirs):
                        status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=eval_overwrite)
                        _record_status(status, i + 1, result_dir)
                else:
                    if on_progress:
                        on_progress(f"Eval: running {total} dirs with {workers} worker(s)")
                    with ThreadPoolExecutor(max_workers=workers) as executor:
                        future_map = {
                            executor.submit(
                                eval_summary.run_eval_result_for_dir,
                                result_dir,
                                overwrite=eval_overwrite,
                            ): result_dir
                            for result_dir in target_dirs
                        }
                        for done, future in enumerate(as_completed(future_map), start=1):
                            result_dir = future_map[future]
                            try:
                                status = future.result()
                            except Exception as exc:
                                # One crashing directory must not abort the others.
                                status = {"path": result_dir, "status": "failed", "detail": str(exc)}
                            _record_status(status, done, result_dir)

                # Generate summary CSVs
                csv_info = eval_summary.generate_summary_and_score_csv(eval_root)
                failed = [s for s in eval_statuses if s.get("status") == "failed"]
                skipped = [s for s in eval_statuses if s.get("status") == "skipped"]
                succeeded = [s for s in eval_statuses if s.get("status") == "success"]
                result["eval_summary"] = {
                    "directories_processed": total,
                    "success": len(succeeded),
                    "failed": len(failed),
                    "skipped": len(skipped),
                    "summary_path": csv_info.get("summary_path", eval_root),
                    "summary_rows": csv_info.get("summary_rows", 0),
                    "score_rows": csv_info.get("score_rows", 0),
                }
                if failed:
                    first = failed[0]
                    result["errors"].append(
                        f"Eval failed for {len(failed)} of {total} directories; "
                        f"first: {first.get('path', '')} ({first.get('detail', '')})"
                    )
            else:
                if on_warning:
                    on_warning("No eval result directories found")
        except Exception as e:
            result["errors"].append(f"Eval exception: {e}")

    # Step 3: Generate parquet (if requested and download succeeded)
    if generate_parquet and result["download_success"]:
        if pkl_archive_to_parquet is None:
            # Previously this case was skipped silently, leaving parquet_path empty
            # with no hint why. Report it as a warning so "errors" stays unchanged.
            if on_warning:
                on_warning("Parquet generation skipped: lib.perception_catalog_io could not be imported")
            return result

        if on_progress:
            on_progress("Generating parquet...")

        def _on_parquet_progress(done: int, total: int) -> None:
            if on_progress:
                on_progress(f"Parquet: Processing {done}/{total} pkl files")

        def _on_parquet_skip(path: str, reason: str) -> None:
            if on_warning:
                on_warning(f"Parquet skipped {path}: {reason}")

        try:
            parquet_path = pkl_archive_to_parquet(
                output_path,
                on_progress=_on_parquet_progress,
                on_skip=_on_parquet_skip,
                project_id=project_id,
                job_id=job_id,
            )
            result["parquet_path"] = parquet_path
        except Exception as e:
            result["errors"].append(f"Parquet exception: {e}")

    return result
def _signal_detail(returncode: int) -> str:
    """Return a human-readable detail string for a subprocess return code.

    A negative code is how ``subprocess`` reports "killed by signal N". A code
    above 128 follows the shell convention ``128 + N`` and is decoded as a
    signal only when N is a real signal on this platform; otherwise (e.g. 255)
    it is reported as a plain exit code rather than an invented signal.
    """
    if returncode < 0:
        sig_num = -returncode
        try:
            sig_name = signal.Signals(sig_num).name
        except ValueError:
            # Definitely a signal, just not one this platform names.
            sig_name = f"signal {sig_num}"
        return f"{sig_name} ({sig_num})"

    if returncode > 128:
        sig_num = returncode - 128
        try:
            return f"{signal.Signals(sig_num).name} ({sig_num})"
        except ValueError:
            pass  # Not a known signal: treat it as an ordinary exit status.

    return f"exit code {returncode}"
def _run_eval_result_for_dir_subprocess(result_dir: str, overwrite: bool = False) -> Dict[str, Any]:
    """Evaluate one scenario directory in a child interpreter.

    The child runs ``python -m lib.eval_summary __run_eval_dir`` from the parent
    directory of this package, so a native crash only kills the child. On exit
    code 0 the last stdout line tagged ``__EVAL_RESULT_JSON__`` is parsed and
    returned; on any other exit code the failure is written next to the results
    and a ``failed`` status dict is returned.
    """
    marker = "__EVAL_RESULT_JSON__"

    child_env = os.environ.copy()
    if "PYTHONFAULTHANDLER" not in child_env:
        # Ask the child to dump a traceback on fatal signals.
        child_env["PYTHONFAULTHANDLER"] = "1"

    app_root = os.fspath(Path(__file__).resolve().parents[1])
    overwrite_flag = "1" if overwrite else "0"
    proc = subprocess.run(
        [sys.executable, "-m", "lib.eval_summary", "__run_eval_dir", result_dir, overwrite_flag],
        cwd=app_root,
        env=child_env,
        text=True,
        capture_output=True,
    )

    if proc.returncode != 0:
        detail = f"eval subprocess failed with {_signal_detail(proc.returncode)}"
        _write_eval_subprocess_failure(
            result_dir,
            detail,
            stdout=proc.stdout,
            stderr=proc.stderr,
        )
        return {"path": result_dir, "status": "failed", "detail": detail}

    # Only the last tagged line counts; an unparsable one falls back to the default.
    tagged = next(
        (ln for ln in reversed(proc.stdout.splitlines()) if ln.startswith(marker)),
        None,
    )
    if tagged is not None:
        try:
            return json.loads(tagged.removeprefix(marker))
        except json.JSONDecodeError:
            pass
    return {"path": result_dir, "status": "success", "detail": "completed"}
def _dataset_id_from_case_dir(case_dir: str) -> str:
    """Resolve the real T4 dataset id for Score.csv; blank if unavailable.

    Lookup order:
      1. ``t4_dataset_id`` in ``<case_dir>/t4_metadata.json``
      2. the first non-blank key under ``Evaluation.Datasets`` in
         ``<case_dir>/scenario.yaml`` (list-of-dicts or dict form)

    This is best-effort: an unreadable, undecodable or malformed file yields
    "" instead of raising, so one bad case directory cannot abort the CSV.
    """
    case_path = Path(case_dir)

    metadata_path = case_path / "t4_metadata.json"
    if metadata_path.exists():
        try:
            with open(metadata_path, "r", encoding="utf-8") as f:
                meta = json.load(f)
            dataset_id = str(meta.get("t4_dataset_id") or "").strip()
            if dataset_id:
                return dataset_id
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        except (OSError, ValueError, TypeError, AttributeError):
            pass

    scenario_path = case_path / "scenario.yaml"
    if scenario_path.exists():
        try:
            import yaml
        except ImportError:
            return ""

        try:
            with open(scenario_path, "r", encoding="utf-8") as f:
                scenario = yaml.safe_load(f) or {}
            datasets = scenario.get("Evaluation", {}).get("Datasets", [])
            if isinstance(datasets, list):
                for item in datasets:
                    if isinstance(item, dict) and item:
                        dataset_id = str(next(iter(item.keys())) or "").strip()
                        if dataset_id:
                            return dataset_id
            elif isinstance(datasets, dict):
                dataset_id = str(next(iter(datasets.keys()), "") or "").strip()
                if dataset_id:
                    return dataset_id
        # yaml.YAMLError: a syntactically broken scenario.yaml.
        except (OSError, ValueError, TypeError, AttributeError, yaml.YAMLError):
            pass

    return ""
def _main() -> int:
    """Command-line entry point used by the isolated eval subprocess.

    Expected argv: ``__run_eval_dir RESULT_DIR [0|1]``. The status dict is
    printed as one line prefixed with ``__EVAL_RESULT_JSON__`` so the parent
    can find it among other output.

    Returns:
        0 after running the eval, 2 for any other invocation, including a
        missing RESULT_DIR (which previously raised IndexError).
    """
    # Require the sub-command AND the directory argument before indexing argv.
    if len(sys.argv) >= 3 and sys.argv[1] == "__run_eval_dir":
        result_dir = sys.argv[2]
        overwrite = len(sys.argv) >= 4 and sys.argv[3] == "1"
        result = _run_eval_result_for_dir_inline(result_dir, overwrite=overwrite)
        print("__EVAL_RESULT_JSON__" + json.dumps(result, ensure_ascii=False))
        return 0
    print("Usage: python -m lib.eval_summary __run_eval_dir RESULT_DIR [0|1]", file=sys.stderr)
    return 2
def _get_first_status(report: dict[str, Any], paths: tuple[tuple[str, ...], ...]) -> str:
    """Return the first non-empty normalized status found along ``paths``.

    Each path is a sequence of nested dict keys, tried in the given order. A
    path that hits a missing key or a non-dict value simply yields no status.
    Returns "" when no path produces one.
    """
    for key_path in paths:
        node: Any = report
        for key in key_path:
            node = node.get(key) if isinstance(node, dict) else None
            if node is None:
                break  # Dead end: nothing deeper to look up.

        found = normalize_job_status(node)
        if found:
            return found

    return ""
def load_test_cases(path: Path | str) -> dict[str, dict[str, Any]]:
    """Read the UTF-8 JSON file at ``path`` and return its parsed content."""
    raw_text = Path(path).read_text(encoding="utf-8")
    return json.loads(raw_text)
def extract_project_id(url: str) -> str:
    """Return the ``project_id`` query value from a report URL.

    If ``url`` contains no ``project_id=`` it is assumed to already be a bare
    project id and is returned unchanged. Otherwise the value is cut at the
    next ``&`` or ``#`` so later query parameters or a fragment are not
    returned as part of the id.
    """
    if "project_id=" not in url:
        return url

    value = url.split("project_id=")[1]
    for separator in ("&", "#"):
        value = value.split(separator, 1)[0]
    return value
def schedule_job(
    self,
    *,
    project_id: str,
    catalog_id: str,
    integration_id: Optional[str] = None,
    target_name: Optional[str] = None,
    source_job_id: Optional[str] = None,
    suite_ids: Optional[list[str]] = None,
    max_retries: int = 1,
    description: str = "no description",
    clean_build: bool = False,
    debug: bool = False,
    release: bool = False,
    record_caret: bool = False,
    log_expiration_time_in_days: float = 14.0,
    is_tag: bool = False,
) -> dict[str, Any]:
    """POST a schedule request for one evaluation job and return the parsed reply.

    At least one of ``target_name`` (a git branch, or a tag when ``is_tag``) and
    ``source_job_id`` is required. ``log_expiration_time_in_days`` is sent as
    whole seconds. With ``record_caret`` the developer option
    ``webauto:ci:caret_enabled`` is added to the build options.

    Raises:
        ValueError: neither ``target_name`` nor ``source_job_id`` was given.
        EvaluationAPIError: no response, or a status code other than 202.
    """
    if not (source_job_id or target_name):
        raise ValueError("Either target_name or source_job_id must be provided.")

    seconds_per_day = 24 * 60 * 60
    build_options: dict[str, Any] = {"clean_build": clean_build, "debug": debug}
    if record_caret:
        build_options["developer_option_names"] = ["webauto:ci:caret_enabled"]
    test_options = {
        "max_retries": max_retries,
        "record_caret": record_caret,
        "log_expiration_time": int(log_expiration_time_in_days * seconds_per_day),
    }

    # Key order is kept as before so the serialized request body is unchanged.
    payload: dict[str, Any] = {
        "build_options": build_options,
        "catalog_id": catalog_id,
        "description": description,
        "release": release,
        "suite_ids": suite_ids or [],
        "test_options": test_options,
    }
    if integration_id:
        payload["integration_id"] = integration_id
    if source_job_id:
        payload["source_job_id"] = str(source_job_id)
    if target_name:
        source_key = "git_tag" if is_tag else "git_branch"
        payload["source"] = {source_key: str(target_name)}

    endpoint = f"{self.api_base_url}/projects/{project_id}/jobs/schedule"
    response = self.request(endpoint, payload, method="POST")
    if response is None:
        raise EvaluationAPIError("No response returned from evaluation API")
    if response.status_code != 202:
        raise EvaluationAPIError(
            f"Failed to schedule job: status={response.status_code}, body={response.text}"
        )
    return json.loads(response.content)
def is_job_completed(self, project_id: str, job_id: str) -> tuple[bool, str, dict[str, Any]]:
    """
    Fetch the job report once and classify it.

    Returns (is_completed, status, report_data): the first two values are
    whatever get_job_completion() derives from the report, and report_data is
    the raw dict returned by get_job_status().
    """
    job_report = self.get_job_status(project_id, job_id)
    completed, job_state = get_job_completion(job_report)
    return completed, job_state, job_report
def get_report_list(
    self,
    project_id: str,
    *,
    status: str = "all",
    max_results: Optional[int] = None,
    catalog_id: Optional[str] = None,
) -> list[dict[str, Any]]:
    """Collect job reports for a project, following ``next_token`` pagination.

    Args:
        project_id: project whose job reports are listed.
        status: value of the ``status`` query parameter.
        max_results: upper bound on the number of reports returned; ``None``
            returns everything. The bound also applies on the last page, and
            paging stops as soon as enough reports were collected.
        catalog_id: optional ``catalog_id`` query parameter.

    Raises:
        EvaluationAPIError: no response, or a status code other than 200.
    """
    reports: list[dict[str, Any]] = []
    next_token = ""
    url = f"{self.api_base_url}/projects/{project_id}/jobs/reports"
    while True:
        params = {
            "next_token": next_token,
            "size": 100,
            "status": status,
        }
        if catalog_id is not None:
            params["catalog_id"] = catalog_id

        response = self.request(url, params)
        if response is None:
            raise EvaluationAPIError("No response returned from evaluation API")
        if response.status_code != 200:
            raise EvaluationAPIError(
                f"Failed to fetch report list: status={response.status_code}, body={response.text}"
            )

        data = json.loads(response.content)
        reports.extend(data.get("reports") or [])
        # A missing or null token both mean "last page".
        next_token = data.get("next_token") or ""

        # Check the cap first, so the final page is truncated too.
        if max_results is not None and len(reports) >= max_results:
            return reports[:max_results]
        if not next_token:
            return reports
def get_build_reports(self, project_id: str, job_id: str) -> dict[str, Any]:
    """GET the build reports of one job and return the decoded JSON body.

    Raises:
        EvaluationAPIError: no response, or a status code other than 200.
    """
    endpoint = f"{self.api_base_url}/projects/{project_id}/jobs/{job_id}/build/reports"
    reply = self.request(endpoint, {})
    if reply is None:
        raise EvaluationAPIError("No response returned from evaluation API")
    if reply.status_code == 200:
        return json.loads(reply.content)
    raise EvaluationAPIError(
        f"Failed to fetch build reports: status={reply.status_code}, body={reply.text}"
    )
def _get_paginated_reports(self, url: str) -> list[dict[str, Any]]:
    """Fetch every page of a ``reports`` listing and return the merged list.

    Pages of up to 100 entries are requested until the response carries no
    ``next_token``. A missing, empty or null token all end the loop; a null
    token previously never matched ``""`` and caused endless requests.

    Raises:
        EvaluationAPIError: no response, or a status code other than 200.
    """
    reports: list[dict[str, Any]] = []
    next_token = ""
    while True:
        params = {
            "next_token": next_token,
            "size": 100,
        }
        response = self.request(url, params)
        if response is None:
            raise EvaluationAPIError("No response returned from evaluation API")
        if response.status_code != 200:
            raise EvaluationAPIError(
                f"Failed to fetch paginated reports: status={response.status_code}, body={response.text}"
            )

        data = json.loads(response.content)
        reports.extend(data.get("reports") or [])
        next_token = data.get("next_token") or ""
        if not next_token:
            return reports
+ + +""" + components.html(html, height=height, scrolling=True) diff --git a/evaluation_dashboard_app/lib/overview_pdf_report.py b/evaluation_dashboard_app/lib/overview_pdf_report.py new file mode 100644 index 0000000..1547f55 --- /dev/null +++ b/evaluation_dashboard_app/lib/overview_pdf_report.py @@ -0,0 +1,2181 @@ +from __future__ import annotations + +import io +import html +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple + +import duckdb +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go + +from lib.score_schema import ( + SCORE_BLOCK_SIZE, + SCORE_NUM_COLS, + SCORE_VIEW_METRIC_COLS, + build_score_view, + infer_score_criteria_count, + score_identity_cols, +) +from lib.summary_compare import build_summary_delta + +PRODUCT_LABEL_JA_DEFAULT = { + "Occlusion-Case": "遮蔽ケース", + "False-Positive-Grass": "草誤検知(草停止)", + "False-Positive-Ground": "地面誤検知", + "False-Positive-Splash": "水しぶき 誤検知", + "False-Positive-Exhaust-Fog": "排ガス・霧 誤検知", + "Missed-Detection-Animal": "動物ロスト(犬)", + "Missed-Detection-Falling-Object": "落下物未検知", + "Missed-Detection-Pedestrian-Child": "歩行者未検知:子供", + "Missed-Detection-Pedestrian-Umbrella": "歩行者未検知:傘", + "Missed-Detection-Pedestrian-Crouching": "歩行者未検知:しゃがむ", + "Missed-Detection-Pedestrian-Near-Structure": "歩行者未検知:構造物に近い", + "False-Positive-Truck": "トラック誤検知", + "Pose-Estimation-Yaw-Error": "Yawおかしい", + "Long-Range-Detection-Failure": "遠方見えない", + "Ghost-Object": "ミサイル", + "Sudden-Fast-Vehicle-Ghost": "高速車両の突然出現・急ブレーキ誘発", + "Misclassification-Structure-Grass-as-Pedestrian": "構造物・草を人に誤検知", + "Misclassification-Structure-Grass-as-Vehicle": "構造物・草を車両に誤検知", + "Misclassification-Bike-Motorcycle": "自転車・バイクのミスラベル", + "Missed-Detection-Unridden-Bike": "人の乗ってないバイク自転車ロスト", + "Missed-Detection-Traffic-Cone": "カラーコーンが認識できない", + "Missed-Detection-Other": "その他ロスト", +} + +_COMPARE_RUN_COLORS = ["#312e81", "#0f766e", "#e86a33", "#6b8e23", 
def make_report_filename(
    run_names: Sequence[str],
    *,
    now: Optional[datetime] = None,
    prefix: str = "overview_report",
) -> str:
    """Build the PDF file name ``<prefix>_<slug>_<timestamp>.pdf``.

    Args:
        run_names: Run names; only the first one is used for the slug. When
            empty, the literal ``"report"`` is slugified instead.
        now: Timestamp to embed; the current local time when omitted.
        prefix: Leading part of the file name.

    Returns:
        The file name with a ``%Y%m%d_%H%M%S`` timestamp and ``.pdf`` suffix.
    """
    moment = now if now else datetime.now()
    stamp = moment.strftime("%Y%m%d_%H%M%S")
    first_run = run_names[0] if run_names else "report"
    return f"{prefix}_{_slugify(first_run)}_{stamp}.pdf"
product_label_map = product_label_map or PRODUCT_LABEL_JA_DEFAULT + generated_at = generated_at or datetime.now() + filters = filters or {} + + def _notify(message: str) -> None: + if progress_callback is not None: + progress_callback(message) + + styles = getSampleStyleSheet() + title_style = ParagraphStyle( + "ReportTitle", + parent=styles["Title"], + fontSize=22, + leading=28, + alignment=TA_LEFT, + textColor=colors.HexColor("#0f172a"), + ) + section_style = ParagraphStyle( + "SectionHeader", + parent=styles["Heading1"], + fontSize=16, + leading=21, + spaceAfter=8, + textColor=colors.HexColor("#0f172a"), + ) + body_style = ParagraphStyle( + "Body", + parent=styles["BodyText"], + fontSize=10.5, + leading=14, + textColor=colors.HexColor("#334155"), + ) + caption_style = ParagraphStyle( + "Caption", + parent=styles["BodyText"], + fontSize=9, + leading=12, + textColor=colors.HexColor("#475569"), + ) + + buffer = io.BytesIO() + doc = SimpleDocTemplate( + buffer, + pagesize=A4, + rightMargin=0.55 * inch, + leftMargin=0.55 * inch, + topMargin=0.55 * inch, + bottomMargin=0.55 * inch, + title="Overview PDF Report", + ) + content_width = doc.width + story: List[Any] = [] + _notify("Preparing cover and active filter summary") + + run_names = [Path(str(r.get("path", ""))).name or f"Run {lbl}" for r, lbl in zip(run_records, run_labels)] + story.extend( + [ + Paragraph("Evaluation Dashboard Report", title_style), + Spacer(1, 8), + Paragraph( + f"Generated {generated_at.strftime('%Y-%m-%d %H:%M:%S')} · " + f"{'Compare mode' if mode == 'Compare Mode' else 'Single mode'}", + body_style, + ), + Spacer(1, 8), + _styled_table( + [["Run", "Label", "Directory"]] + [ + [f"Run {lbl}", name, str(record.get("path", ""))] + for record, lbl, name in zip(run_records, run_labels, run_names) + ], + content_width, + ), + Spacer(1, 12), + Paragraph( + f"Perception labels: {_summarize_filter_values(filters.get('perception_labels'))}
" + f"Product labels: {_summarize_filter_values(filters.get('product_labels'))}", + body_style, + ), + Spacer(1, 16), + ] + ) + + _notify("Building Overview section") + overview_section = _build_overview_section(run_records, run_labels, product_label_map) + _notify("Building TP Summary section") + tp_section = _build_tp_summary_section(run_records, run_labels, product_label_map) + _notify("Building Criteria Based Score section") + criteria_section = _build_criteria_section(run_records, run_labels) + _notify("Building Detection Stats section") + detection_section = _build_detection_section(run_records, run_labels) + + sections = [ + ("Overview", overview_section), + ("TP Summary", tp_section), + ("Criteria Based Score", criteria_section), + ("Detection Stats", detection_section), + ] + + available_sections = 0 + for idx, (title, payload) in enumerate(sections): + story.append(Paragraph(title, section_style)) + story.append(Paragraph(payload["summary"], body_style)) + story.append(Spacer(1, 8)) + if payload.get("flowables"): + for flowable in payload["flowables"]: + story.append(flowable) + story.append(Spacer(1, 8)) + if payload.get("tables"): + for table in payload["tables"]: + story.append(_styled_table(table, content_width)) + story.append(Spacer(1, 8)) + figs = payload.get("figures", []) + if figs: + exported_any_fig = False + for fig, caption in figs: + try: + story.append(_plotly_figure_to_image(fig, content_width, ImageReader)) + story.append(Spacer(1, 4)) + story.append(Paragraph(caption, caption_style)) + exported_any_fig = True + except Exception as exc: + story.append( + Paragraph( + f"Chart export unavailable for this figure: {str(exc)}", + caption_style, + ) + ) + story.append(Spacer(1, 12)) + if exported_any_fig: + available_sections += 1 + else: + if payload.get("tables"): + available_sections += 1 + story.append(Paragraph(payload.get("fallback_note", "Section unavailable."), caption_style)) + story.append(Spacer(1, 12)) + if idx != len(sections) - 1: 
+ story.append(PageBreak()) + + if available_sections == 0: + story.append( + Paragraph( + "No report sections were available for export. Check Summary.csv, Score.csv, and parquet data for the selected run(s).", + body_style, + ) + ) + + def _draw_page_number(canvas, document): + canvas.setFont("Helvetica", 9) + canvas.setFillColor(colors.HexColor("#64748b")) + canvas.drawRightString(document.pagesize[0] - document.rightMargin, 18, f"Page {document.page}") + + _notify("Assembling PDF pages") + doc.build(story, onFirstPage=_draw_page_number, onLaterPages=_draw_page_number) + _notify("Finalizing PDF bytes") + return buffer.getvalue() + + +def _build_tp_summary_section( + run_records: Sequence[dict], + run_labels: Sequence[str], + product_label_map: dict, +) -> dict: + available = [r for r in run_records if r.get("summary") is not None] + if not available: + return { + "summary": "Summary.csv is not available for the selected run set.", + "figures": [], + "tables": [], + "fallback_note": "TP Summary skipped because Summary.csv is missing.", + } + + summaries = [r["summary"] for r in run_records if r.get("summary") is not None] + labels = [run_labels[i] for i, r in enumerate(run_records) if r.get("summary") is not None] + figures: List[Tuple[go.Figure, str]] = [] + tables: List[list[list[str]]] = [] + + metrics_table = [["Run", "Rows", "TP mean", "XRMS mean", "YRMS mean", "XSTD mean", "YSTD mean"]] + for lbl, df in zip(labels, summaries): + metrics_table.append( + [ + lbl, + f"{len(df):,}", + _fmt_number(df["TP"].mean()), + _fmt_number(df["xrms"].mean()), + _fmt_number(df["yrms"].mean()), + _fmt_number(df["xstd"].mean()), + _fmt_number(df["ystd"].mean()), + ] + ) + tables.append(metrics_table) + + if len(summaries) >= 2: + baseline_lbl = labels[0] + for cand_idx in range(1, len(summaries)): + cand_lbl = labels[cand_idx] + delta_df = build_summary_delta(summaries[0], summaries[cand_idx]) + if delta_df.empty: + figures.append( + ( + _make_text_placeholder_figure( + f"No 
overlapping Summary rows for delta ({cand_lbl} vs {baseline_lbl})." + ), + f"Delta view is empty because baseline {baseline_lbl} and candidate {cand_lbl} do not share Summary keys.", + ) + ) + else: + figures.extend(_build_tp_default_compare_figures(delta_df, cand_lbl)) + else: + figures.extend(_build_tp_default_single_figures(summaries[0])) + + return { + "summary": "This section follows the default TP Summary page view as closely as possible using the current Overview-selected runs and filters.", + "figures": figures, + "tables": tables, + "fallback_note": "No TP Summary figures were available after filtering.", + } + + +def _build_overview_section( + run_records: Sequence[dict], + run_labels: Sequence[str], + product_label_map: dict, +) -> dict: + summary_runs = [(run_labels[i], r["summary"]) for i, r in enumerate(run_records) if r.get("summary") is not None] + if not summary_runs: + return { + "summary": "Overview metrics are unavailable because Summary.csv is missing for the selected run set.", + "figures": [], + "tables": [], + "fallback_note": "Overview section skipped because Summary.csv is missing.", + } + + tables: List[list[list[str]]] = [] + figures: List[Tuple[go.Figure, str]] = [] + flowables: List[Any] = [] + + metric_card_rows = [["Run", "TP mean", "XRMS", "YRMS", "XSTD", "YSTD"]] + for lbl, df in summary_runs: + metric_card_rows.append( + [ + lbl, + _fmt_number(df["TP"].mean()), + _fmt_number(df["xrms"].mean()), + _fmt_number(df["yrms"].mean()), + _fmt_number(df["xstd"].mean()), + _fmt_number(df["ystd"].mean()), + ] + ) + tables.append(metric_card_rows) + flowables.extend(_build_overview_metric_cards(summary_runs)) + + summaries = [df for _, df in summary_runs] + labels = [lbl for lbl, _ in summary_runs] + fig_perception = _build_tp_mean_by_label_compare_figure(summaries, labels, "perception_label") + if fig_perception is not None: + figures.append((fig_perception, "Overview page result: TP mean by Perception Label.")) + fig_product = 
_build_tp_mean_by_label_compare_figure( + summaries, + labels, + "product_label", + label_jp_map=product_label_map, + ) + if fig_product is not None: + figures.append((fig_product, "Overview page result: TP mean by Product Label.")) + + return { + "summary": "This section mirrors the Overview page first: summary metrics and TP mean by label using the current Overview run selection and label filters.", + "flowables": flowables, + "figures": figures, + "tables": tables, + "fallback_note": "Overview figures were unavailable after filtering.", + } + + +def _build_overview_metric_cards(summary_runs: Sequence[Tuple[str, pd.DataFrame]]) -> List[Any]: + from reportlab.lib import colors + from reportlab.platypus import Table, TableStyle + + card_cells: List[Any] = [] + for idx, (lbl, df) in enumerate(summary_runs): + accent = _compare_color(idx) + rows = [ + [f"Run {lbl}"], + [f"TP mean {_fmt_number(df['TP'].mean())}"], + [f"XRMS {_fmt_number(df['xrms'].mean())} YRMS {_fmt_number(df['yrms'].mean())}"], + [f"XSTD {_fmt_number(df['xstd'].mean())} YSTD {_fmt_number(df['ystd'].mean())}"], + ] + t = Table(rows, colWidths=[220]) + t.setStyle( + TableStyle( + [ + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor(accent)), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), + ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), + ("FONTSIZE", (0, 0), (-1, 0), 11), + ("BACKGROUND", (0, 1), (-1, -1), colors.HexColor("#f8fafc")), + ("TEXTCOLOR", (0, 1), (-1, -1), colors.HexColor("#0f172a")), + ("FONTNAME", (0, 1), (-1, -1), "Helvetica-Bold"), + ("FONTSIZE", (0, 1), (-1, -1), 10), + ("BOX", (0, 0), (-1, -1), 0.8, colors.HexColor("#cbd5e1")), + ("ROUNDEDCORNERS", [10, 10, 10, 10]), + ("LEFTPADDING", (0, 0), (-1, -1), 10), + ("RIGHTPADDING", (0, 0), (-1, -1), 10), + ("TOPPADDING", (0, 0), (-1, -1), 8), + ("BOTTOMPADDING", (0, 0), (-1, -1), 8), + ] + ) + ) + card_cells.append(t) + + if not card_cells: + return [] + + cards_per_row = 2 + grid_rows: List[List[Any]] = [] + for start in range(0, 
len(card_cells), cards_per_row): + row = card_cells[start : start + cards_per_row] + if len(row) < cards_per_row: + row = row + ["" for _ in range(cards_per_row - len(row))] + grid_rows.append(row) + + grid = Table(grid_rows, colWidths=[260, 260], hAlign="LEFT") + grid.setStyle( + TableStyle( + [ + ("VALIGN", (0, 0), (-1, -1), "TOP"), + ("LEFTPADDING", (0, 0), (-1, -1), 0), + ("RIGHTPADDING", (0, 0), (-1, -1), 12), + ("TOPPADDING", (0, 0), (-1, -1), 0), + ("BOTTOMPADDING", (0, 0), (-1, -1), 0), + ] + ) + ) + return [grid] + + +def _build_criteria_section(run_records: Sequence[dict], run_labels: Sequence[str]) -> dict: + score_runs = [(run_labels[i], r) for i, r in enumerate(run_records) if r.get("score") is not None] + if not score_runs: + return { + "summary": "Score.csv is not available for the selected run set.", + "figures": [], + "tables": [], + "fallback_note": "Criteria section skipped because Score.csv is missing.", + } + + criteria_count = min(infer_score_criteria_count(rec["score"]) for _, rec in score_runs) + if criteria_count <= 0: + return { + "summary": "Score.csv was loaded, but no criteria blocks were detected.", + "figures": [], + "tables": [], + "fallback_note": "Criteria section skipped because no criteria blocks were found.", + } + + criteria_idx = 0 + views: List[Tuple[str, pd.DataFrame]] = [] + for lbl, rec in score_runs: + df_view = _build_score_view(rec["score"], criteria_idx) + if not df_view.empty: + df_view["Run"] = lbl + views.append((lbl, df_view)) + + if not views: + return { + "summary": "Criteria data was present but could not be shaped into a report view.", + "figures": [], + "tables": [], + "fallback_note": "Criteria section skipped because the selected rows were empty.", + } + + combined = pd.concat([df for _, df in views], ignore_index=True) + tables = [ + [["Run", "Rows", "Pass rate mean", "Pass rate median", "NM mean"]] + + [ + [ + lbl, + f"{len(df):,}", + _fmt_number(df["pass_rate"].mean()), + 
_fmt_number(df["pass_rate"].median()), + _fmt_number(df["nm"].mean()), + ] + for lbl, df in views + ] + ] + + figures: List[Tuple[go.Figure, str]] = [] + if len(views) >= 2: + figures.extend(_build_criteria_default_compare_figures(views)) + scenario_table = _build_criteria_compare_table(views) + else: + figures.extend(_build_criteria_default_single_figures(views[0][1])) + scenario_table = _build_criteria_single_table(views[0][1]) + if scenario_table: + tables.append(scenario_table) + + return { + "summary": "This section follows the default Criteria Based Score page setup: criteria0, metric=pass_rate, and group_by=GT_OBJ.", + "figures": figures, + "tables": tables, + "fallback_note": "Criteria charts were unavailable for the selected run set.", + } + + +def _build_detection_section(run_records: Sequence[dict], run_labels: Sequence[str]) -> dict: + parquet_paths: List[Tuple[str, str]] = [] + for rec, lbl in zip(run_records, run_labels): + files = sorted(Path(rec["path"]).glob("*.parquet")) + if files: + parquet_paths.append((lbl, str(files[0]))) + + if not parquet_paths: + return { + "summary": "No parquet files were found in the selected run set.", + "figures": [], + "tables": [], + "fallback_note": "Detection Stats skipped because parquet data is missing.", + } + + con = duckdb.connect() + views: List[Tuple[str, str]] = [] + try: + for idx, (lbl, pq) in enumerate(parquet_paths): + view_name = "pdf_eval_flat" if idx == 0 else f"pdf_eval_flat_{idx}" + _create_eval_flat_view(con, pq, view_name) + views.append((lbl, view_name)) + + tables = [[["Run", "TP", "FP", "FN", "TPR", "Precision", "F1"]]] + figures: List[Tuple[go.Figure, str]] = [] + + kpi_rows = [["Run", "TP", "FP", "FN", "TPR", "Precision", "F1"]] + for lbl, view in views: + kpi = _kpi_row_for_view(con, view) + if kpi is None: + continue + kpi_rows.append( + [ + lbl, + f"{kpi['tp']:,}", + f"{kpi['fp']:,}", + f"{kpi['fn']:,}", + _fmt_percent(kpi["tpr"]), + _fmt_percent(kpi["precision"]), + 
_fmt_percent(kpi["f1"]), + ] + ) + tables = [kpi_rows] if len(kpi_rows) > 1 else [] + + dataset_rows = [["Run", "Distinct datasets"]] + for lbl, view in views: + n_ds = con.execute(f"SELECT COUNT(DISTINCT t4dataset_id) FROM {view}").fetchone()[0] + dataset_rows.append([lbl, f"{int(n_ds or 0):,}"]) + if len(dataset_rows) > 1: + tables.append(dataset_rows) + + df_status = _query_status_counts(con, views) + if not df_status.empty: + fig_status = _build_detection_status_figure(df_status) + _apply_detection_theme(fig_status, "Detection status distribution by label") + figures.append((fig_status, "Stacked TP/FP/FN counts per label from the first parquet file in each selected run.")) + + figures.extend(_build_detection_distance_figures(con, views)) + figures.extend(_build_detection_tpr_figures(con, views)) + figures.extend(_build_detection_mean_error_figures(con, views)) + figures.extend(_build_detection_perception_diff_figures(con, views)) + + return { + "summary": ( + "This section follows the default Detection Stats view as closely as possible: " + "summary KPIs, status distribution, distance panels, TP rate, mean error, and compare-mode perception diff." 
+ ), + "figures": figures, + "tables": tables, + "fallback_note": "Detection charts were unavailable for the selected parquet data.", + } + finally: + con.close() + + +def _build_tp_mean_by_label_compare_figure( + df_list: Sequence[pd.DataFrame], + run_labels: Sequence[str], + label_col: str, + *, + label_jp_map: Optional[dict] = None, +) -> Optional[go.Figure]: + if not df_list or not run_labels or label_col not in df_list[0].columns: + return None + all_labels = set() + groups = [] + for df in df_list: + if label_col not in df.columns: + return None + xdf = df[df[label_col].notna() & (df[label_col].astype(str).str.strip() != "")] + g = xdf.groupby(label_col)["TP"].mean() if not xdf.empty else pd.Series(dtype=float) + groups.append(g) + all_labels.update(g.index) + if not all_labels: + return None + all_labels = sorted(all_labels) + labels_disp = [label_jp_map.get(l, l) for l in all_labels] if label_jp_map else all_labels + traces = [] + for idx, (g, lbl) in enumerate(zip(groups, run_labels)): + vals = [g.get(label, float("nan")) for label in all_labels] + traces.append( + go.Bar( + name=lbl, + x=labels_disp, + y=vals, + marker=dict(color=_OVERVIEW_COMPARE_COLORS[idx % len(_OVERVIEW_COMPARE_COLORS)]), + text=[f"{x:.2f}" if pd.notna(x) else "N/A" for x in vals], + textposition="auto", + ) + ) + fig = go.Figure(traces) + fig.update_layout( + title=f"TP mean by {label_col.replace('_', ' ')}", + barmode="group", + xaxis_title=label_col.replace("_", " ").title(), + yaxis_title="TP mean", + height=420, + margin=dict(t=70, b=55, l=55, r=25), + legend_title="Run", + template="plotly_white", + ) + return fig + + +def _build_tp_default_single_figures(df: pd.DataFrame) -> List[Tuple[go.Figure, str]]: + df_f = df.copy() + for column in ("vx", "vy"): + if column in df_f.columns and not df_f.empty: + q1, q99 = df_f[column].quantile([0.01, 0.99]).values + df_f[column] = df_f[column].clip(q1, q99) + figures: List[Tuple[go.Figure, str]] = [] + fig_rms = px.scatter( + df_f, + 
def _build_tp_default_compare_figures(
    df_delta: pd.DataFrame,
    candidate_label: str,
    *,
    baseline_label: str = "A",
) -> List[Tuple[go.Figure, str]]:
    """Build the compare-mode TP Summary figures for one candidate run.

    Produces, in order: an X RMS scatter, a Y RMS scatter (both coloured by
    ``TP_delta``), a ``TP_delta`` histogram and a ``TP_delta`` violin.

    Args:
        df_delta: Delta frame; the columns ``xrms``, ``yrms``, ``xrms_B``,
            ``yrms_B``, ``xrms_delta``, ``yrms_delta``, ``TP_delta`` and
            ``id`` are read.
        candidate_label: Run label used for the ``*_B`` axis and captions.
        baseline_label: Run label used for the unsuffixed axis. Defaults to
            ``"A"``, the value that was previously hard-coded.

    Returns:
        ``(figure, caption)`` tuples in the order listed above.
    """
    tp_col = "TP_delta"
    figures: List[Tuple[go.Figure, str]] = []

    # The X and Y scatters differ only in the axis they describe.
    for axis in ("x", "y"):
        base_col = f"{axis}rms"
        cand_col = f"{base_col}_B"
        axis_name = f"{axis.upper()} RMS"
        fig = px.scatter(
            df_delta,
            x=cand_col,
            y=base_col,
            color=tp_col,
            hover_data=["id", "xrms_delta", "yrms_delta"],
            labels={
                cand_col: f"{axis_name} ({candidate_label})",
                base_col: f"{axis_name} ({baseline_label})",
                tp_col: "Delta TP",
                "xrms_delta": "Delta X RMS",
                "yrms_delta": "Delta Y RMS",
            },
            title=f"Scatter: {axis_name} ({candidate_label}) vs {axis_name} ({baseline_label})",
            color_continuous_scale="Viridis",
        )
        fig.update_traces(marker=dict(size=8, opacity=0.6))
        _apply_tp_clean_theme(fig)
        figures.append(
            (
                fig,
                f"TP Summary compare ({candidate_label} vs baseline): {axis_name} scatter, colored by TP delta.",
            )
        )

    for builder, kind in (
        (_build_tp_distribution_figure, "distribution"),
        (_build_tp_violin_figure, "violin"),
    ):
        figures.append(
            (
                builder(df_delta, tp_col),
                f"TP Summary compare ({candidate_label} vs baseline): TP delta {kind}.",
            )
        )
    return figures
def _build_criteria_default_single_figures(df_view: pd.DataFrame) -> List[Tuple[go.Figure, str]]:
    """Build the single-run Criteria figures: histogram, mean bar and box plot.

    Rows are grouped by ``GT_OBJ`` when that column has at least one non-null
    value, otherwise by ``Option``.

    Args:
        df_view: Score view; ``pass_rate``, ``GT_OBJ`` and, when ``GT_OBJ``
            is entirely null, ``Option`` are read.

    Returns:
        Three ``(figure, caption)`` tuples in the order histogram, bar, box.
    """
    metric = "pass_rate"
    palette = px.colors.qualitative.Bold
    group_by = "Option"
    if df_view["GT_OBJ"].notna().any():
        group_by = "GT_OBJ"

    histogram = px.histogram(
        df_view,
        x=metric,
        color=group_by,
        nbins=30,
        marginal="box",
        color_discrete_sequence=palette,
    )
    _apply_criteria_theme(histogram, f"{metric} · histogram")
    charts: List[Tuple[go.Figure, str]] = [
        (histogram, "Default Criteria page distribution chart for criteria0 and metric = pass_rate."),
    ]

    # Mean per group, highest first.
    group_means = df_view.groupby(group_by, as_index=False)[metric].mean()
    group_means = group_means.sort_values(metric, ascending=False)
    mean_bar = px.bar(
        group_means,
        x=group_by,
        y=metric,
        text_auto=".2f",
        color=group_by,
        color_discrete_sequence=palette,
    )
    _apply_criteria_theme(mean_bar, f"Mean {metric}")
    mean_bar.update_layout(showlegend=False)
    charts.append((mean_bar, f"Default grouped mean chart by {group_by}."))

    box_plot = px.box(
        df_view,
        x=group_by,
        y="pass_rate",
        points="all",
        color=group_by,
        color_discrete_sequence=palette,
    )
    _apply_criteria_theme(box_plot, "Pass rate by group")
    box_plot.update_layout(showlegend=False)
    charts.append((box_plot, f"Default pass-rate overview by {group_by}."))
    return charts
+ nbins=30, + barmode="overlay", + opacity=0.55, + marginal="box", + ) + _apply_criteria_theme(fig_hist, f"{metric} · row-level distribution") + figures.append((fig_hist, "Default compare overlay view for pass-rate distribution.")) + + df_avg = combined.groupby([group_by, "Run"], as_index=False)[metric].mean() + obj_means = df_avg.groupby(group_by, as_index=False)[metric].mean().sort_values(metric, ascending=False) + obj_order = [x for x in obj_means[group_by].tolist() if x in set(df_avg[group_by])] + df_avg[group_by] = pd.Categorical(df_avg[group_by], categories=obj_order, ordered=True) + df_avg = df_avg.sort_values([group_by, "Run"]) + fig_bar = px.bar( + df_avg, + x=group_by, + y=metric, + color="Run", + color_discrete_map=px_map, + category_orders={group_by: obj_order, "Run": run_order}, + barmode="group", + text_auto=".2f", + ) + _apply_criteria_theme(fig_bar, f"Mean {metric} by {group_by}") + figures.append((fig_bar, f"Default compare grouped mean view by {group_by}.")) + + fig_box = px.box( + combined, + x=group_by, + y="pass_rate", + color="Run", + color_discrete_map=px_map, + category_orders={group_by: obj_order, "Run": run_order}, + points="all", + ) + _apply_criteria_theme(fig_box, "Pass rate overview") + figures.append((fig_box, f"Default compare pass-rate overview by {group_by}.")) + + scenario_delta = _build_criteria_compare_delta_figure(views) + if scenario_delta is not None: + base_l = run_order[0] + if len(run_order) == 2: + cap = f"Default compare per-scenario delta view for candidate {run_order[1]} vs baseline {base_l}." + else: + rest = ", ".join(run_order[1:]) + cap = ( + f"Default compare per-scenario delta vs baseline {base_l} " + f"for candidates {rest} (grouped bars)." 
def _build_criteria_single_table(df_view: pd.DataFrame) -> Dict[str, Any]:
    """Build the single-run per-scenario pass-rate table specification.

    The mean ``pass_rate`` is computed per identity-column group and the 20
    highest groups are kept.

    Args:
        df_view: Score view containing ``pass_rate`` and the columns
            returned by ``score_identity_cols``.

    Returns:
        A dict (not a bare row list, as the previous annotation claimed) with
        ``"rows"`` (header row followed by formatted data rows) and
        ``"col_width_weights"`` (one relative width per column, summing to 1).
    """
    # Copy to a list so the header concatenation works for any sequence type.
    key_cols = list(score_identity_cols(df_view))
    scenario_metric = (
        df_view.groupby(key_cols, as_index=False)["pass_rate"]
        .mean()
        .sort_values("pass_rate", ascending=False)
        .head(20)
    )

    rows = [key_cols + ["Pass rate mean"]]
    for _, row in scenario_metric.iterrows():
        identity_cells = [_shorten_scenario_name(str(row[c])) for c in key_cols]
        rows.append(identity_cells + [_fmt_number(row["pass_rate"])])

    # The first column gets the largest share; the rest split the remainder.
    first_w = 0.56 if len(key_cols) > 1 else 0.72
    rest_w = (1.0 - first_w) / len(key_cols)
    return {"rows": rows, "col_width_weights": [first_w] + [rest_w] * len(key_cols)}
else 0.44 + rest_w = (1.0 - scen_w) / max(ncols - 1, 1) + weights = [scen_w] + [rest_w] * (ncols - 1) + return {"rows": rows, "col_width_weights": weights} + + +def _build_criteria_compare_delta_figure(views: Sequence[Tuple[str, pd.DataFrame]]) -> Optional[go.Figure]: + if len(views) < 2: + return None + labels = [lbl for lbl, _ in views] + base = labels[0] + key_cols = score_identity_cols(views[0][1]) + merges = [] + for lbl, df in views: + g = df.groupby(key_cols, as_index=False)["pass_rate"].mean() + merges.append(g.rename(columns={"pass_rate": f"pr_{lbl}"})) + per_scenario = merges[0] + for g in merges[1:]: + per_scenario = per_scenario.merge(g, on=key_cols, how="inner") + if per_scenario.empty: + return None + long_rows: List[dict] = [] + delta_cols: List[str] = [] + for cand in labels[1:]: + dcol = f"delta_{cand}" + per_scenario[dcol] = per_scenario[f"pr_{cand}"] - per_scenario[f"pr_{base}"] + delta_cols.append(dcol) + rank_key = per_scenario[delta_cols].abs().max(axis=1) + vis = per_scenario.reindex(rank_key.sort_values(ascending=False).index).head(20) + if "Dataset" in key_cols: + scenario_labels = vis["Scenario"].astype(str) + " [" + vis["Dataset"].astype(str) + "]" + else: + scenario_labels = vis["Scenario"].astype(str) + scen_order = [_shorten_scenario_name(str(s)) for s in scenario_labels.tolist()] + for _, row in vis.iterrows(): + scen_raw = f"{row['Scenario']} [{row['Dataset']}]" if "Dataset" in key_cols else row["Scenario"] + scen_disp = _shorten_scenario_name(str(scen_raw)) + for cand in labels[1:]: + long_rows.append( + { + "Scenario": scen_disp, + "vs_baseline": f"Δ({cand} - {base})", + "delta": float(row[f"delta_{cand}"]), + } + ) + melted = pd.DataFrame(long_rows) + if melted.empty: + return None + legend_order = [f"Δ({cand} - {base})" for cand in labels[1:]] + color_map = { + leg: _COMPARE_RUN_COLORS[(i + 1) % len(_COMPARE_RUN_COLORS)] + for i, leg in enumerate(legend_order) + } + fig = px.bar( + melted, + x="Scenario", + y="delta", + 
def _build_detection_status_figure(df_status: pd.DataFrame) -> go.Figure:
    """Return a stacked bar chart of status counts per label.

    Three layouts are possible:
    * more than one distinct value in a ``run`` column - one facet per run,
      with the ``run=`` prefix stripped from the facet annotations;
    * more than six distinct labels - horizontal bars;
    * otherwise - vertical bars.
    """
    shared_kwargs = dict(
        color="status",
        barmode="stack",
        title="Status Distribution per Label",
        labels={"num": "Count", "label": "Label", "status": "Status"},
        color_discrete_map={"TP": "#2d8f47", "FN": "#d73027", "FP": "#E86A33", "TN": "#4A90D9"},
    )

    has_several_runs = "run" in df_status.columns and df_status["run"].nunique() > 1
    if has_several_runs:
        faceted = px.bar(df_status, x="label", y="num", facet_col="run", **shared_kwargs)
        faceted.for_each_annotation(lambda ann: ann.update(text=ann.text.replace("run=", "")))
        return faceted

    if df_status["label"].nunique() > 6:
        # Many labels read better on the vertical axis.
        return px.bar(df_status, x="num", y="label", orientation="h", **shared_kwargs)

    return px.bar(df_status, x="label", y="num", **shared_kwargs)
TP rate: %{y:.2%}", + ) + ) + fig.add_trace( + go.Bar( + x=df_both["bin_label"], + y=df_both["fpr"], + name="FP rate", + marker_color=_COMPARE_RUN_COLORS[2], + hovertemplate="%{x}
FP rate: %{y:.2%}", + ) + ) + _apply_detection_theme(fig, "TP & FP rate by distance") + fig.update_layout( + xaxis_title="Distance bin", + yaxis_title="Rate", + yaxis_range=[0, 1], + barmode="group", + xaxis=dict(tickangle=-35, categoryorder="array", categoryarray=df_both["bin_label"].tolist()), + hovermode="x unified", + ) + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + figures.append((fig, "Detection Stats distance panel in bar-chart mode across the full 0-150+ range.")) + df_oc = _query_object_counts_single(con, views[0][1]) + if not df_oc.empty: + align_x = sorted(df_oc["bin_label"].unique(), key=_distance_bin_sort_key) + pivot_oc = df_oc.pivot_table(index="bin_label", columns="label", values="n", aggfunc="sum", fill_value=0).reindex(align_x, fill_value=0) + fig_oc = go.Figure() + for j, lab in enumerate(pivot_oc.columns): + c = _compare_color(j) + fig_oc.add_trace( + go.Bar( + x=align_x, + y=pivot_oc[lab].values, + name=str(lab), + marker_color=c, + hovertemplate=f"{lab}
%{{x}}
Count: %{{y:.0f}}", + ) + ) + _apply_detection_theme(fig_oc, "Object count by distance bin") + fig_oc.update_layout( + xaxis_title="Distance bin", + yaxis_title="Count", + barmode="group", + xaxis=dict(tickangle=-35, categoryorder="array", categoryarray=align_x), + hovermode="x unified", + ) + figures.append((fig_oc, "Detection Stats object-count-by-distance panel in bar-chart mode across the full 0-150+ range.")) + return figures + + df_tpr = _query_distance_rates_compare(con, views, metric="tpr") + if not df_tpr.empty: + fig_tpr = go.Figure() + for i, lbl in enumerate(labels): + d = df_tpr[df_tpr["run"] == lbl].sort_values("bin_order") + c = _compare_color(i) + fig_tpr.add_trace( + go.Bar( + x=d["bin_label"], + y=d["tpr"], + name=lbl, + marker_color=c, + hovertemplate=f"{lbl}
%{{x}}
TP rate: %{{y:.2%}}", + ) + ) + align_x = df_tpr[df_tpr["run"] == labels[0]].sort_values("bin_order")["bin_label"].tolist() + _apply_detection_theme(fig_tpr, "TP rate by distance") + fig_tpr.update_layout( + xaxis_title="Distance bin", + yaxis_title="TP rate", + yaxis_range=[0, 1], + barmode="group", + xaxis=dict(tickangle=-35, categoryorder="array", categoryarray=align_x), + hovermode="x unified", + ) + fig_tpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + figures.append((fig_tpr, "Detection Stats compare distance panel in bar-chart mode: TP rate by distance.")) + + df_fpr = _query_distance_rates_compare(con, views, metric="fpr") + if not df_fpr.empty: + fig_fpr = go.Figure() + for i, lbl in enumerate(labels): + d = df_fpr[df_fpr["run"] == lbl].sort_values("bin_order") + c = _compare_color(i) + fig_fpr.add_trace( + go.Bar( + x=d["bin_label"], + y=d["fpr"], + name=lbl, + marker_color=c, + hovertemplate=f"{lbl}
%{{x}}
FP rate: %{{y:.2%}}", + ) + ) + align_x = df_fpr[df_fpr["run"] == labels[0]].sort_values("bin_order")["bin_label"].tolist() + _apply_detection_theme(fig_fpr, "FP rate by distance") + fig_fpr.update_layout( + xaxis_title="Distance bin", + yaxis_title="FP rate", + yaxis_range=[0, 1], + barmode="group", + xaxis=dict(tickangle=-35, categoryorder="array", categoryarray=align_x), + hovermode="x unified", + ) + fig_fpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + figures.append((fig_fpr, "Detection Stats compare distance panel in bar-chart mode: FP rate by distance.")) + + df_oc = _query_object_counts_compare(con, views) + if not df_oc.empty: + align_x = sorted(df_oc["bin_label"].unique(), key=_distance_bin_sort_key) + pivot_oc = df_oc.pivot_table(index="bin_label", columns="run", values="n", aggfunc="sum", fill_value=0).reindex(align_x, fill_value=0) + fig_oc = go.Figure() + for j, rl in enumerate([r for r in labels if r in pivot_oc.columns]): + c = _compare_color(j) + fig_oc.add_trace( + go.Bar( + x=align_x, + y=pivot_oc[rl].values, + name=str(rl), + marker_color=c, + hovertemplate=f"{rl}
%{{x}}
def _build_detection_tpr_figures(
    con: duckdb.DuckDBPyConnection,
    views: Sequence[Tuple[str, str]],
) -> List[Tuple[go.Figure, str]]:
    """Build the per-label TP-rate panel(s) for the Detection Stats section.

    Args:
        con: DuckDB connection on which the views exist.
        views: ``(run label, view name)`` pairs; the first entry is the
            baseline run.

    Returns:
        A list of ``(figure, caption)`` tuples. With exactly one view this is
        a bar chart per label; otherwise a single spider chart comparing the
        runs. The list is empty when no view yields any row.
    """
    figures: List[Tuple[go.Figure, str]] = []
    labels = [lbl for lbl, _ in views]

    if len(views) == 1:
        df_tpr = _query_tpr_by_label(con, views[0][1], _DEFAULT_MAX_EVAL_RANGE)
        if df_tpr.empty:
            return figures
        single_title = f"Total TP rate within {_DEFAULT_MAX_EVAL_RANGE} [m]"
        fig = px.bar(
            df_tpr,
            x="label",
            y="tpr",
            title=single_title,
            labels={"tpr": "TP Rate", "label": "Label"},
            color_discrete_sequence=[_COMPARE_RUN_COLORS[0]],
        )
        fig.update_traces(marker_color=_COMPARE_RUN_COLORS[0])
        _apply_detection_theme(fig, single_title)
        fig.update_layout(yaxis_range=[0, 1.2])
        fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)")
        figures.append((fig, "Default Detection Stats TP-rate panel: bar chart per object class."))
        return figures

    per_run = []
    for lbl, view_name in views:
        df = _query_tpr_by_label(con, view_name, _DEFAULT_MAX_EVAL_RANGE)
        if df.empty:
            continue
        per_run.append(df.assign(run=lbl))
    if not per_run:
        return figures

    df_all = pd.concat(per_run, ignore_index=True)
    cats = sorted(df_all["label"].astype(str).unique())
    # The title is derived from the same constant that filters the data
    # (``dist_h < max_range``), so it cannot drift from the query.
    compare_title = f"TP rate (< {_DEFAULT_MAX_EVAL_RANGE} m)"
    fig = _tpr_spider_compare_figure(df_all, cats, compare_title, labels, height=360)
    figures.append((fig, "Default compare Detection Stats TP-rate panel: spider chart per object class."))
    return figures
List[Tuple[go.Figure, str]]: + figures: List[Tuple[go.Figure, str]] = [] + labels = [lbl for lbl, _ in views] + if not _views_have_error_columns(con, [view for _, view in views]): + return figures + if len(views) == 1: + df = _query_mean_error_by_label(con, views[0][1], _DEFAULT_MAX_EVAL_RANGE) + if df.empty: + return figures + fig = go.Figure() + fig.add_trace(go.Bar(x=df["label"], y=df["mean_abs_x_error"], name="X Error", marker_color=_compare_color(0))) + fig.add_trace(go.Bar(x=df["label"], y=df["mean_abs_y_error"], name="Y Error", marker_color=_compare_color(1))) + fig.add_trace(go.Bar(x=df["label"], y=df["mean_abs_yaw_error"], name="Yaw Error", marker_color=_compare_color(2))) + _apply_detection_theme(fig, f"Mean Error within {_DEFAULT_MAX_EVAL_RANGE} [m]") + fig.update_layout(xaxis_title="Label", yaxis_title="Error [m] or [rad]", barmode="group") + figures.append((fig, "Default Detection Stats mean-error panel: grouped bars for X/Y/Yaw.")) + return figures + + dfs = [] + for lbl, view_name in views: + df = _query_mean_error_by_label(con, view_name, _DEFAULT_MAX_EVAL_RANGE) + if df.empty: + continue + df["run"] = lbl + dfs.append(df) + if not dfs: + return figures + df_err_melt = pd.concat(dfs, ignore_index=True) + cats = sorted(df_err_melt["label"].astype(str).unique()) + err_specs = [ + ("Mean |x error| (within 50 m)", "mean_abs_x_error", "Mean |x error| (m)", ".3f"), + ("Mean |y error| (within 50 m)", "mean_abs_y_error", "Mean |y error| (m)", ".3f"), + ("Mean |yaw error| (within 50 m)", "mean_abs_yaw_error", "Mean |yaw error| (rad)", ".4f"), + ] + for chart_title, col, hover_lbl, tfmt in err_specs: + figures.append( + ( + _scalar_metric_spider_compare_figure(df_err_melt, cats, chart_title, labels, col, hover_lbl, height=400, tickformat=tfmt), + f"Default compare Detection Stats mean-error panel: spider chart for {hover_lbl}.", + ) + ) + return figures + + +def _query_distance_rates_single(con: duckdb.DuckDBPyConnection, view_name: str) -> pd.DataFrame: + 
def _query_distance_rates_compare(
    con: duckdb.DuckDBPyConnection,
    views: Sequence[Tuple[str, str]],
    *,
    metric: str,
) -> pd.DataFrame:
    """Query one rate metric per distance bin for every run.

    Args:
        con: DuckDB connection on which the views exist.
        views: ``(run label, view name)`` pairs.
        metric: ``"tpr"`` (TP among GT rows with status TP/FN) or ``"fpr"``
            (FP among EST rows with status TP/FP). The value also names the
            result column.

    Returns:
        The per-run results concatenated, each tagged with a ``run`` column
        and passed through ``_decorate_distance_bins``. An empty frame when
        no view returns rows.

    Raises:
        ValueError: If ``metric`` is neither ``"tpr"`` nor ``"fpr"``.
    """
    # metric -> (numerator column, denominator column) of the ``stats`` CTE.
    rate_columns = {"tpr": ("tp_gt", "gt_total"), "fpr": ("fp_est", "est_total")}
    if metric not in rate_columns:
        # ``metric`` is spliced into the SQL text as a column alias, so only
        # the known names may pass; previously any other value silently
        # computed the FP rate under an arbitrary alias.
        raise ValueError(f"metric must be one of {sorted(rate_columns)}, got {metric!r}")
    numerator, denominator = rate_columns[metric]

    frames = []
    for lbl, view_name in views:
        query = f"""
            WITH stats AS (
                SELECT
                    distance_bin,
                    COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) AS gt_total,
                    COUNT(*) FILTER (WHERE source='GT' AND status='TP') AS tp_gt,
                    COUNT(*) FILTER (WHERE source='EST' AND status IN ('TP','FP')) AS est_total,
                    COUNT(*) FILTER (WHERE source='EST' AND status='FP') AS fp_est
                FROM {view_name}
                GROUP BY distance_bin
            )
            SELECT
                distance_bin,
                CASE
                    WHEN {denominator} > 0
                    THEN CAST({numerator} AS DOUBLE) / {denominator}
                    ELSE 0
                END AS {metric}
            FROM stats
        """
        df = con.execute(query).df()
        if df.empty:
            continue
        df["run"] = lbl
        frames.append(_decorate_distance_bins(df))
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
def _build_detection_perception_diff_figures(
    con: duckdb.DuckDBPyConnection,
    views: Sequence[Tuple[str, str]],
) -> List[Tuple[go.Figure, str]]:
    """Build perception-diff figures for every candidate run against the baseline.

    The first entry of ``views`` is the baseline. For each further entry the
    function emits, in this order and only when data exists: an "improved"
    sunburst, a "degraded" sunburst, and comparison-lens treemaps by class,
    by scenario and by frame (top 36 frames, the remainder folded into one
    "Other frames" item).

    Args:
        con: DuckDB connection on which the views exist.
        views: ``(run label, view name)`` pairs.

    Returns:
        A list of ``(figure, caption)`` tuples; empty when fewer than two
        views are given.
    """
    if len(views) < 2:
        return []
    figures: List[Tuple[go.Figure, str]] = []
    # Use the real baseline label in titles/captions instead of assuming the
    # baseline run is literally called "A".
    base_lbl = views[0][0]
    base_view = views[0][1]
    sunburst_specs = (
        ("improved", "Improved", [[0.0, "#f7fcf5"], [1.0, "#1a9850"]]),
        ("degraded", "Degraded", [[0.0, "#fff5f0"], [1.0, "#d73027"]]),
    )
    for lbl, comp_view in views[1:]:
        df_obj = _query_perception_diff_objects(con, base_view, comp_view)
        if df_obj.empty:
            continue
        pair = f"{lbl} vs {base_lbl}"

        for change_type, heading, color_scale in sunburst_specs:
            hierarchy = _baobab_hierarchy_from_objects(df_obj, change_type, f"{heading} ({pair})", 15, 10)
            if hierarchy.empty or "n" not in hierarchy.columns:
                continue
            fig_sun = px.sunburst(
                hierarchy,
                path=["root", "scen_g", "fr_display", "label"],
                values="n",
                color="n",
                color_continuous_scale=color_scale,
                title=f"Sunburst: {change_type} (n = {int(hierarchy['n'].sum())} GT objects)",
            )
            _apply_detection_theme(fig_sun, f"Sunburst: {change_type} ({pair})")
            figures.append(
                (fig_sun, f"Perception diff sunburst for {change_type} objects: {lbl} vs baseline {base_lbl}.")
            )

        df_by_label, scen_agg, df_frame_sorted = _query_perception_diff_lens_tables(con, base_view, comp_view)
        if not df_by_label.empty:
            tdf_l = _comparison_lens_treemap_df(
                df_by_label["label"],
                df_by_label["improved_cnt"],
                df_by_label["degraded_cnt"],
                pair,
            )
            fig_l = _comparison_lens_treemap_figure(tdf_l, "By class")
            if fig_l is not None:
                figures.append((fig_l, f"Perception diff comparison lens by class: {lbl} vs baseline {base_lbl}."))
        if not scen_agg.empty:
            tdf_s = _comparison_lens_treemap_df(
                scen_agg["scenario_name"].astype(str),
                scen_agg["improved_cnt"],
                scen_agg["degraded_cnt"],
                pair,
            )
            fig_s = _comparison_lens_treemap_figure(tdf_s, "By scenario")
            if fig_s is not None:
                figures.append((fig_s, f"Perception diff comparison lens by scenario: {lbl} vs baseline {base_lbl}."))
        if not df_frame_sorted.empty:
            fr_cap = 36  # frames shown individually; the rest are summed
            fr_top = df_frame_sorted.head(fr_cap).copy()
            names = (
                fr_top["scenario_name"].astype(str).str.slice(0, 26)
                + "\n· f"
                + fr_top["frame_index"].astype(str)
            ).tolist()
            improved = fr_top["improved_cnt"].astype(float).tolist()
            degraded = fr_top["degraded_cnt"].astype(float).tolist()
            rest = df_frame_sorted.iloc[fr_cap:]
            if not rest.empty:
                rest_improved = float(rest["improved_cnt"].sum())
                rest_degraded = float(rest["degraded_cnt"].sum())
                if rest_improved > 0 or rest_degraded > 0:
                    names.append(f"Other frames\n({len(rest)} frames)")
                    improved.append(rest_improved)
                    degraded.append(rest_degraded)
            tdf_f = _comparison_lens_treemap_df(pd.Series(names), pd.Series(improved), pd.Series(degraded), pair)
            fig_f = _comparison_lens_treemap_figure(tdf_f, "By frame")
            if fig_f is not None:
                figures.append((fig_f, f"Perception diff comparison lens by frame: {lbl} vs baseline {base_lbl}."))
    return figures
t4dataset_id, + COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, + COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, + COALESCE(b.tp_base, FALSE) AS tp_base, + COALESCE(c.tp_comp, FALSE) AS tp_comp, + COALESCE(b.suite_name, c.suite_name, '') AS suite_name, + COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, + COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name + FROM base_gt b + FULL OUTER JOIN comp_gt c + ON b.t4dataset_id = c.t4dataset_id + AND b.frame_index = c.frame_index + AND b.gt_uuid = c.gt_uuid + ), + obj_attrs AS ( + SELECT + t4dataset_id, + frame_index, + uuid, + MAX(CAST(label AS VARCHAR)) AS label, + MAX(dist_h) AS dist_h + FROM {base_view} + WHERE source = 'GT' + GROUP BY 1, 2, 3 + ) + SELECT + j.t4dataset_id, + j.frame_index, + j.gt_uuid, + COALESCE(e.label, '') AS label, + COALESCE(e.dist_h, 0.0) AS dist_h, + {_DISTANCE_BIN_CASE.replace("dist_h", "COALESCE(e.dist_h, 0.0)")} AS distance_bin, + j.suite_name, + j.scenario_name, + j.t4dataset_name, + CASE + WHEN NOT j.tp_base AND j.tp_comp THEN 'improved' + WHEN j.tp_base AND NOT j.tp_comp THEN 'degraded' + WHEN j.tp_base AND j.tp_comp THEN 'both_tp' + ELSE 'both_fn' + END AS change_type, + j.tp_base, + j.tp_comp + FROM joined j + LEFT JOIN obj_attrs e + ON CAST(j.t4dataset_id AS VARCHAR) = CAST(e.t4dataset_id AS VARCHAR) + AND j.frame_index = CAST(e.frame_index AS VARCHAR) + AND j.gt_uuid = e.uuid + ORDER BY change_type, j.t4dataset_id, j.frame_index + """ + try: + return con.execute(query).df() + except Exception: + return pd.DataFrame() + + +def _query_perception_diff_lens_tables( + con: duckdb.DuckDBPyConnection, + base_view: str, + comp_view: str, +) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + query_label = f""" + WITH base_gt AS ( + SELECT t4dataset_id, frame_index, uuid AS gt_uuid, COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base + FROM {base_view} + WHERE source 
= 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + GROUP BY 1, 2, 3 + ), + comp_gt AS ( + SELECT t4dataset_id, frame_index, uuid AS gt_uuid, COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp + FROM {comp_view} + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + GROUP BY 1, 2, 3 + ), + joined AS ( + SELECT COALESCE(b.label, c.label) AS label, COALESCE(b.tp_base, FALSE) AS tp_base, COALESCE(c.tp_comp, FALSE) AS tp_comp + FROM base_gt b FULL OUTER JOIN comp_gt c + ON b.t4dataset_id = c.t4dataset_id AND b.frame_index = c.frame_index AND b.gt_uuid = c.gt_uuid + ) + SELECT label, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt + FROM joined + GROUP BY label + """ + query_frame = f""" + WITH base_gt AS ( + SELECT t4dataset_id, frame_index, uuid AS gt_uuid, COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name + FROM {base_view} + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + GROUP BY 1,2,3 + ), + comp_gt AS ( + SELECT t4dataset_id, frame_index, uuid AS gt_uuid, COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name + FROM {comp_view} + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + GROUP BY 1,2,3 + ), + joined AS ( + SELECT COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, + COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, + COALESCE(b.tp_base, FALSE) AS tp_base, + COALESCE(c.tp_comp, FALSE) AS tp_comp, + COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name + FROM base_gt b FULL OUTER JOIN comp_gt c + ON b.t4dataset_id = c.t4dataset_id AND b.frame_index = c.frame_index AND b.gt_uuid = 
def _baobab_hierarchy_from_objects(
    df_obj: pd.DataFrame,
    change_type: str,
    root_label: str,
    max_scenarios: int,
    max_frames: int,
) -> pd.DataFrame:
    """Aggregate per-object diff rows into a scenario / frame / label hierarchy.

    Only rows whose ``change_type`` equals the requested one are used. Counts
    are taken per (scenario, frame, label). Scenarios outside the
    ``max_scenarios`` largest are merged into "Other scenarios", and within
    each scenario group frames outside the ``max_frames`` largest are merged
    into "Other frames".

    Returns:
        A frame with columns ``scen_g``, ``fr_g``, ``label``, ``n``, ``root``
        and ``fr_display``, or an empty frame when there is nothing to show.
    """
    if df_obj.empty or "change_type" not in df_obj.columns:
        return pd.DataFrame()
    selected = df_obj[df_obj["change_type"] == change_type].copy()
    if selected.empty:
        return pd.DataFrame()

    # Blank / missing names get a visible placeholder.
    for column, placeholder in (("scenario_name", "(no scenario)"), ("label", "(no label)")):
        selected[column] = selected[column].fillna("").astype(str).replace("", placeholder)
    selected["frame_key"] = selected["t4dataset_id"].astype(str) + "|f" + selected["frame_index"].astype(str)

    counts = (
        selected.groupby(["scenario_name", "frame_key", "label"], dropna=False)
        .size()
        .reset_index(name="n")
    )

    def _largest_keys(frame, key, limit):
        """Return the set of ``key`` values with the ``limit`` largest summed ``n``."""
        totals = frame.groupby(key)["n"].sum().sort_values(ascending=False)
        return set(totals.head(limit).index.tolist())

    kept_scenarios = _largest_keys(counts, "scenario_name", max_scenarios)
    counts["scen_g"] = counts["scenario_name"].where(
        counts["scenario_name"].isin(kept_scenarios), "Other scenarios"
    )

    pieces = []
    for _, group in counts.groupby("scen_g"):
        kept_frames = _largest_keys(group, "frame_key", max_frames)
        bucketed = group.assign(
            fr_g=group["frame_key"].where(group["frame_key"].isin(kept_frames), "Other frames")
        )
        pieces.append(bucketed.groupby(["scen_g", "fr_g", "label"], as_index=False)["n"].sum())

    result = pd.concat(pieces, ignore_index=True)
    result["root"] = root_label
    result["fr_display"] = result["fr_g"].astype(str)
    return result
_comparison_lens_treemap_figure(tdf: pd.DataFrame, title: str) -> Optional[go.Figure]: + if tdf.empty or "n" not in tdf.columns: + return None + fig = px.treemap( + tdf, + path=["root", "side", "item"], + values="n", + color="side", + color_discrete_map={"Improved": "#1a9850", "Degraded": "#d73027"}, + ) + fig.update_traces( + textfont_size=12, + textinfo="label+value+percent parent", + hovertemplate=("%{label}
GT objects: %{value:.0f}
def _views_have_error_columns(con: duckdb.DuckDBPyConnection, view_names: Sequence[str]) -> bool:
    """Return True when every given view exposes the x/y/yaw error columns.

    Args:
        con: DuckDB connection on which the views exist.
        view_names: Names of the views to inspect.

    Returns:
        ``False`` for an empty sequence or as soon as one view lacks any of
        ``x_error``, ``y_error`` or ``yaw_error``; ``True`` otherwise.
    """
    if not view_names:
        return False
    required = ("x_error", "y_error", "yaw_error")
    # Inspect every view, not just the first: a later view without these
    # columns would otherwise pass this check and fail when queried.
    for view_name in view_names:
        # LIMIT 0 is enough - only the column names are needed.
        columns = con.execute(f"SELECT * FROM {view_name} LIMIT 0").df().columns
        if any(col not in columns for col in required):
            return False
    return True
TP rate: %{r:.2%}", + ) + ) + _apply_detection_theme(fig, title) + fig.update_layout( + height=height, + polar=dict( + radialaxis=dict(visible=True, range=[0, 1], tickformat=".0%", gridcolor="rgba(0,0,0,0.08)"), + angularaxis=dict(tickfont=dict(size=10)), + ), + legend=dict(orientation="h", yanchor="bottom", y=-0.12, xanchor="center", x=0.5), + ) + return fig + + +def _scalar_metric_spider_compare_figure( + df_all: pd.DataFrame, + categories: List[str], + title: str, + run_order: List[str], + value_col: str, + hover_metric: str, + *, + height: int = 380, + tickformat: str = ".3f", +) -> go.Figure: + fig = go.Figure() + max_r = 0.0 + traces_r: List[List[float]] = [] + for run_lbl in run_order: + sub = df_all[df_all["run"] == run_lbl].drop_duplicates("label").set_index("label") + r_vals = [float(sub.loc[c, value_col]) if c in sub.index and pd.notna(sub.loc[c, value_col]) else 0.0 for c in categories] + traces_r.append(r_vals) + if r_vals: + max_r = max(max_r, max(r_vals)) + r_max = max(max_r * 1.08, 1.0) + for i, run_lbl in enumerate(run_order): + r_vals = traces_r[i] + r_closed = r_vals + r_vals[:1] + theta = categories + categories[:1] + c = _compare_color(i) + fig.add_trace( + go.Scatterpolar( + r=r_closed, + theta=theta, + name=str(run_lbl), + line=dict(color=c, width=2), + fillcolor=f"rgba({int(c[1:3],16)},{int(c[3:5],16)},{int(c[5:7],16)},0.12)", + fill="toself", + hovertemplate="%{theta}
" + hover_metric + ": %{r:" + tickformat + "}", + ) + ) + _apply_detection_theme(fig, title) + fig.update_layout( + height=height, + polar=dict( + radialaxis=dict(visible=True, range=[0, r_max], tickformat=tickformat, gridcolor="rgba(0,0,0,0.08)"), + angularaxis=dict(tickfont=dict(size=9)), + ), + legend=dict(orientation="h", yanchor="bottom", y=-0.18, xanchor="center", x=0.5), + ) + return fig + + +def _make_text_placeholder_figure(text: str) -> go.Figure: + fig = go.Figure() + fig.add_annotation(text=text, x=0.5, y=0.5, xref="paper", yref="paper", showarrow=False, font=dict(size=16, color="#475569")) + fig.update_xaxes(visible=False) + fig.update_yaxes(visible=False) + fig.update_layout( + height=240, + template="plotly_white", + margin=dict(t=20, b=20, l=20, r=20), + paper_bgcolor="rgba(248,250,252,0.9)", + plot_bgcolor="rgba(255,255,255,0.95)", + ) + return fig + + +def _build_score_view(df_raw: pd.DataFrame, criteria_idx: int) -> pd.DataFrame: + return build_score_view(df_raw, criteria_idx) + + +def _create_eval_flat_view(con: duckdb.DuckDBPyConnection, parquet_path: str, view_name: str) -> None: + query = f""" + CREATE OR REPLACE VIEW {view_name} AS + WITH src AS ( + SELECT * FROM parquet_scan('{parquet_path}') + UNION BY NAME + SELECT CAST(NULL AS VARCHAR) AS visibility, + CAST(NULL AS VARCHAR) AS suite_name, + CAST(NULL AS VARCHAR) AS scenario_name, + CAST(NULL AS VARCHAR) AS t4dataset_name + WHERE FALSE + ), + base AS ( + SELECT + * REPLACE (coalesce(CAST(visibility AS VARCHAR), 'not available') AS visibility), + sqrt(CAST(x AS DOUBLE)*CAST(x AS DOUBLE) + CAST(y AS DOUBLE)*CAST(y AS DOUBLE)) AS dist_h + FROM src + WHERE x IS NOT NULL AND y IS NOT NULL + ) + SELECT + *, + {_DISTANCE_BIN_CASE} AS distance_bin + FROM base + """ + con.execute(query) + + +def _kpi_row_for_view(con: duckdb.DuckDBPyConnection, view_name: str) -> Optional[dict]: + query = f""" + SELECT + COUNT(*) FILTER (WHERE source = 'GT' AND status = 'TP') AS tp_gt, + COUNT(*) FILTER (WHERE 
def _query_status_counts(con: duckdb.DuckDBPyConnection, views: Sequence[Tuple[str, str]]) -> pd.DataFrame:
    """Count rows per (run, label, status) within 50 m for all given views.

    One ``SELECT`` per view is combined with ``UNION ALL`` and ordered by
    run, label and status.

    Args:
        con: DuckDB connection on which the views exist.
        views: ``(run label, view name)`` pairs.

    Returns:
        A frame with columns ``run``, ``label``, ``status`` and ``num``, or
        an empty frame when ``views`` is empty.
    """

    def _sql_literal(text: str) -> str:
        """Quote ``text`` as a SQL string literal, doubling embedded quotes."""
        return "'" + str(text).replace("'", "''") + "'"

    # The run label ends up inside the SQL text, so it must be escaped; a
    # label containing an apostrophe used to produce an invalid statement.
    parts = [
        f"SELECT {_sql_literal(lbl)} AS run, label, status, COUNT(*) AS num "
        f"FROM {view_name} WHERE dist_h < 50 GROUP BY label, status"
        for lbl, view_name in views
    ]
    if not parts:
        return pd.DataFrame()
    query = " UNION ALL ".join(parts) + " ORDER BY run, label, status"
    return con.execute(query).df()
template="plotly_white", + title=dict(text=title, font=dict(size=16, color="#0f172a"), x=0, xanchor="left", pad=dict(t=8, b=12)), + font=dict(family="system-ui, -apple-system, 'Segoe UI', sans-serif", size=12, color="#334155"), + paper_bgcolor="rgba(248, 250, 252, 0.92)", + plot_bgcolor="rgba(255, 255, 255, 0.95)", + margin=dict(l=56, r=28, t=72, b=52), + height=420, + hoverlabel=dict(bgcolor="white", font_size=13, font_family="system-ui"), + legend=dict( + title_text="", + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1, + bgcolor="rgba(255,255,255,0.7)", + ), + ) + fig.update_xaxes(showgrid=True, gridcolor="rgba(148,163,184,0.25)", zeroline=False) + fig.update_yaxes(showgrid=True, gridcolor="rgba(148,163,184,0.25)", zeroline=False) + + +def _apply_detection_theme(fig: go.Figure, title: str) -> None: + fig.update_layout( + title=dict(text=title, font=dict(size=14, color="#1f2937")), + font=dict(family='"Inter", "Segoe UI", sans-serif', size=11), + paper_bgcolor="rgba(0,0,0,0)", + plot_bgcolor="rgba(248,250,252,0.6)", + margin=dict(t=48, b=40, l=52, r=24), + height=390, + legend=dict( + orientation="h", + yanchor="bottom", + y=1.02, + xanchor="right", + x=1, + font=dict(size=11), + ), + ) + fig.update_xaxes( + tickfont=dict(size=11), + title_font=dict(size=12), + gridcolor="rgba(0,0,0,0.08)", + zeroline=True, + zerolinecolor="rgba(0,0,0,0.15)", + ) + fig.update_yaxes( + tickfont=dict(size=11), + title_font=dict(size=12), + gridcolor="rgba(0,0,0,0.08)", + zeroline=True, + zerolinecolor="rgba(0,0,0,0.15)", + ) + + +def _plotly_figure_to_image(fig: go.Figure, content_width: float, image_reader_cls): + from reportlab.platypus import Image + + png_bytes = fig.to_image(format="png", width=1400, height=800, scale=2) + image_buffer = io.BytesIO(png_bytes) + reader = image_reader_cls(image_buffer) + img_width, img_height = reader.getSize() + target_width = content_width + target_height = target_width * (img_height / img_width) + image_buffer.seek(0) 
+ return Image(image_buffer, width=target_width, height=target_height) + + +def _styled_table(rows: Any, content_width: float): + from reportlab.lib import colors + from reportlab.lib.styles import getSampleStyleSheet + from reportlab.platypus import Table, TableStyle + + col_width_weights = None + if isinstance(rows, dict): + col_width_weights = rows.get("col_width_weights") + rows = rows.get("rows", []) + if not rows: + rows = [["No data"]] + ncols = max(len(row) for row in rows) + styles = getSampleStyleSheet() + header_style = styles["BodyText"].clone("table_header") + header_style.fontName = "Helvetica-Bold" + header_style.fontSize = 8.5 + header_style.leading = 10 + body_style = styles["BodyText"].clone("table_body") + body_style.fontName = "Helvetica" + body_style.fontSize = 8.2 + body_style.leading = 9.6 + body_style.textColor = colors.HexColor("#0f172a") + normalized = [] + for row_idx, row in enumerate(rows): + padded = list(row) + [""] * (ncols - len(row)) + cell_style = header_style if row_idx == 0 else body_style + normalized.append([ + _table_paragraph(cell, cell_style) + for cell in padded + ]) + if col_width_weights and len(col_width_weights) == ncols: + total = sum(col_width_weights) or 1.0 + col_widths = [content_width * (w / total) for w in col_width_weights] + else: + col_width = content_width / ncols + col_widths = [col_width] * ncols + table = Table(normalized, colWidths=col_widths, repeatRows=1) + table.setStyle( + TableStyle( + [ + ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#e2e8f0")), + ("TEXTCOLOR", (0, 0), (-1, 0), colors.HexColor("#0f172a")), + ("GRID", (0, 0), (-1, -1), 0.4, colors.HexColor("#cbd5e1")), + ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.white, colors.HexColor("#f8fafc")]), + ("VALIGN", (0, 0), (-1, -1), "TOP"), + ("LEFTPADDING", (0, 0), (-1, -1), 6), + ("RIGHTPADDING", (0, 0), (-1, -1), 6), + ("TOPPADDING", (0, 0), (-1, -1), 5), + ("BOTTOMPADDING", (0, 0), (-1, -1), 5), + ] + ) + ) + return table + + +def 
_distance_bin_order() -> List[str]: + return [ + "[0,10)", + "[10,20)", + "[20,30)", + "[30,40)", + "[40,50)", + "[50,60)", + "[60,70)", + "[70,80)", + "[80,90)", + "[90,100)", + "[100,110)", + "[110,120)", + "[120,130)", + "[130,140)", + "[140,150)", + "[150,inf)", + ] + + +def _ensure_reportlab_available() -> Optional[str]: + try: + import reportlab # noqa: F401 + except ImportError: + return "PDF export requires the `reportlab` package to be installed." + return None + + +def _slugify(value: str) -> str: + clean = "".join(ch.lower() if ch.isalnum() else "_" for ch in str(value)) + while "__" in clean: + clean = clean.replace("__", "_") + return clean.strip("_") or "report" + + +def _fmt_number(value: Any) -> str: + if value is None or pd.isna(value): + return "N/A" + return f"{float(value):.2f}" + + +def _fmt_percent(value: Any) -> str: + if value is None or pd.isna(value): + return "N/A" + return f"{100.0 * float(value):.1f}%" + + +def _summarize_filter_values(values: Optional[Iterable[Any]], *, empty_label: str = "All") -> str: + if values is None: + return empty_label + vals = [str(v) for v in values if str(v).strip() != ""] + if not vals: + return empty_label + if len(vals) <= 6: + return ", ".join(vals) + return ", ".join(vals[:6]) + f", ... (+{len(vals) - 6} more)" + + +def _shorten_scenario_name(value: str, *, max_len: int = 52) -> str: + text = str(value) + if len(text) <= max_len: + return text + return text[: max_len - 3] + "..." + + +def _table_paragraph(value: Any, style: Any): + from reportlab.platypus import Paragraph + + text = html.escape("" if value is None else str(value)).replace("\n", "
") + return Paragraph(text, style) diff --git a/evaluation_dashboard_app/lib/overview_url_hydrate.py b/evaluation_dashboard_app/lib/overview_url_hydrate.py new file mode 100644 index 0000000..8f9fc1a --- /dev/null +++ b/evaluation_dashboard_app/lib/overview_url_hydrate.py @@ -0,0 +1,74 @@ +""" +Rehydrate session_state from Overview URL query params when server-side session is empty. + +Overview syncs `mode`, `run_a`, `run_b`, ... via `st.query_params`. After a load-balancer hop to a +different Streamlit replica, `st.session_state` may not contain `runA` even though the user already +used Overview — the URL still encodes the selection. This module rebuilds `runA` / compare state +from that URL so multipage analysis works without requiring Overview to run again on the same box. +""" + +from __future__ import annotations + +import streamlit as st + +from lib.path_utils import get_data_root, get_run_display_name, get_run_storage_name, list_run_directories +from lib.run_loader import load_run + + +def try_hydrate_session_from_overview_query_params() -> bool: + """ + If `runA` is missing but the URL has Overview-style params (`run_a`, optional `mode` / `run_b`…), + load runs and populate `session_state`. Returns True if `runA` is present afterward. 
+ """ + if "runA" in st.session_state: + return True + params = st.query_params + run_a_name = params.get("run_a") + if not run_a_name: + return False + root = get_data_root() + if not root.exists() or not root.is_dir(): + return False + run_dirs = list_run_directories() + name_to_dir = {get_run_display_name(p): p for p in run_dirs} + name_to_dir.update({get_run_storage_name(p): p for p in run_dirs}) + if run_a_name not in name_to_dir: + return False + mode_param = (params.get("mode") or "single").lower() + try: + if mode_param == "compare": + url_compare = [ + params.get(k) + for k in ("run_b", "run_c", "run_d", "run_e") + if params.get(k) + ] + valid = [n for n in url_compare if n in name_to_dir] + if not valid: + return False + run_a_dir = name_to_dir[run_a_name] + compare_dirs = [name_to_dir[n] for n in valid] + all_dirs = [run_a_dir] + compare_dirs + run_labels = ["A"] + [chr(66 + i) for i in range(len(compare_dirs))] + all_runs = [load_run(d) for d in all_dirs] + st.session_state.update( + { + "mode": "Compare Mode", + "runA": all_runs[0], + "all_runs": all_runs, + "run_labels": run_labels, + "df_cmp": None, + } + ) + if len(all_runs) >= 2: + st.session_state["runB"] = all_runs[1] + else: + st.session_state["runB"] = None + return True + run_a = load_run(name_to_dir[run_a_name]) + st.session_state["runA"] = run_a + st.session_state["mode"] = "Single Mode" + for key in ("all_runs", "run_labels", "runB", "df_cmp"): + st.session_state.pop(key, None) + return True + except Exception: + return False diff --git a/evaluation_dashboard_app/lib/page_chrome.py b/evaluation_dashboard_app/lib/page_chrome.py index 5bd6e08..d316661 100644 --- a/evaluation_dashboard_app/lib/page_chrome.py +++ b/evaluation_dashboard_app/lib/page_chrome.py @@ -76,7 +76,7 @@ def render_loaded_data_section(entries: Sequence[Tuple[str, str]]) -> None: f"""
{la}
-
{pa}
+
{pa}
""", unsafe_allow_html=True, @@ -95,7 +95,7 @@ def render_loaded_data_section(entries: Sequence[Tuple[str, str]]) -> None: f"""
{la}
-
{pa}
+
{pa}
""", unsafe_allow_html=True, diff --git a/evaluation_dashboard_app/lib/path_utils.py b/evaluation_dashboard_app/lib/path_utils.py index ca698a4..5fa77a8 100644 --- a/evaluation_dashboard_app/lib/path_utils.py +++ b/evaluation_dashboard_app/lib/path_utils.py @@ -9,9 +9,12 @@ """ import os +import re from pathlib import Path from typing import Optional, List, Tuple +import yaml + # Root for all evaluation data. Set EVAL_DASHBOARD_DATA_ROOT to override (e.g. /var/eval_dashboard/data). _DATA_ROOT: Optional[Path] = None @@ -112,12 +115,122 @@ def resolve_under_data_root( return None, str(e) +def _looks_like_analysis_run(path: Path) -> bool: + return ( + (path / "Summary.csv").exists() + or (path / "Score.csv").exists() + or any(path.glob("*.parquet")) + or (path / "current.csv").exists() + or (path / "future.csv").exists() + ) + + +def _is_internal_trend_release_dir(path: Path) -> bool: + return path.name.startswith("trend_release_") + + +RELEASE_ROLE_DIRS = ("performance", "usecase", "devops") +RELEASE_ROLE_LABELS = { + "performance": "Performance", + "usecase": "Usecase", + "devops": "DevOps", +} +_PILOT_AUTO_PREFIX_PATTERN = re.compile(r"^\s*Pilot\.Auto\s*", re.IGNORECASE) + + +def _looks_like_release_container(path: Path) -> bool: + return ( + (path / "metadata.yaml").exists() + and any((path / name).is_dir() for name in RELEASE_ROLE_DIRS) + and not _looks_like_analysis_run(path) + ) + + +def _load_yaml_metadata(path: Path) -> dict: + if not path.is_file(): + return {} + try: + with path.open("r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + except (OSError, yaml.YAMLError): + return {} + return data if isinstance(data, dict) else {} + + +def _compact_release_version(metadata: dict, fallback: str) -> str: + version = str(metadata.get("version_abbr") or metadata.get("pilot_auto_version") or "").strip() + if not version: + return fallback + version = _PILOT_AUTO_PREFIX_PATTERN.sub("", version).strip() or version + version = version.replace("/", "-") + 
return version + + +def _release_run_display_name(run_path: Path) -> Optional[str]: + role_label = "" + release_dir = run_path + if run_path.name in RELEASE_ROLE_LABELS and _looks_like_release_container(run_path.parent): + release_dir = run_path.parent + role_label = RELEASE_ROLE_LABELS[run_path.name] + elif _looks_like_release_container(run_path): + role_label = "Release" + else: + return None + + metadata = _load_yaml_metadata(run_path / "metadata.yaml") or _load_yaml_metadata(release_dir / "metadata.yaml") + version = _compact_release_version(metadata, release_dir.name.replace("release_spec_", "")) + date = str(metadata.get("date") or "").strip() + parts = [f"[REL] {version}"] + if role_label: + parts.append(role_label) + if date: + parts.append(date) + return " | ".join(parts) + + +def get_run_display_name(run_path: Path) -> str: + """Return a stable user-facing run selector name.""" + release_name = _release_run_display_name(run_path) + if release_name: + return release_name + root = get_data_root() + try: + return run_path.resolve().relative_to(root).as_posix() + except Exception: + return run_path.name + + +def get_run_storage_name(run_path: Path) -> str: + """Return the raw path-like run name relative to the data root.""" + root = get_data_root() + try: + return run_path.resolve().relative_to(root).as_posix() + except Exception: + return run_path.name + + def list_run_directories() -> List[Path]: - """Return sorted list of run directories (immediate subdirs of data root) that exist.""" + """Return sorted run directories, including release analysis children.""" root = get_data_root() if not root.exists(): return [] - return sorted([p for p in root.iterdir() if p.is_dir()]) + runs: List[Path] = [] + seen = set() + for child in sorted([p for p in root.iterdir() if p.is_dir()]): + if _is_internal_trend_release_dir(child): + continue + resolved = child.resolve() + if resolved not in seen and not _looks_like_release_container(child): + runs.append(child) + 
seen.add(resolved) + for release_child_name in RELEASE_ROLE_DIRS: + release_child = child / release_child_name + if release_child.is_dir() and _looks_like_analysis_run(release_child): + release_resolved = release_child.resolve() + if release_resolved not in seen: + runs.append(release_child) + seen.add(release_resolved) + return sorted(runs, key=get_run_display_name) def count_tlr_scenarios(path: Path) -> int: @@ -174,7 +287,7 @@ def get_run_info(run_path: Path) -> dict: has_score = (run_path / "Score.csv").exists() has_parquet = any(run_path.glob("*.parquet")) return { - "name": run_path.name, + "name": get_run_display_name(run_path), "path": run_path, "size_bytes": size_bytes, "mtime": mtime, @@ -186,23 +299,29 @@ def get_run_info(run_path: Path) -> dict: def resolve_run_subdirectory(run_name: str) -> Tuple[Optional[Path], str]: """ - Resolve a run directory by name (must be a direct child of data root). + Resolve a run directory by display name under the data root. Returns (path, "") on success, or (None, error_message). """ root = get_data_root() if not run_name or run_name.strip() != run_name: return None, "Invalid run name." - if os.sep in run_name or "/" in run_name or ".." in run_name: + if "\x00" in run_name or "\\" in run_name: return None, "Invalid run name." - run_path = root / run_name - if not run_path.exists(): - return None, f"Run does not exist: {run_name}" - if not run_path.is_dir(): - return None, "Not a directory." + display_matches = [path for path in list_run_directories() if get_run_display_name(path) == run_name] + if display_matches: + return display_matches[0], "" + + run_path = (root / run_name).resolve() try: run_path.relative_to(root) except ValueError: return None, "Run is not under data root." + if run_path == root: + return None, "Invalid run name." + if not run_path.exists(): + return None, f"Run does not exist: {run_name}" + if not run_path.is_dir(): + return None, "Not a directory." 
return run_path, "" diff --git a/evaluation_dashboard_app/lib/perception_catalog_io.py b/evaluation_dashboard_app/lib/perception_catalog_io.py index 2b954af..3a59504 100644 --- a/evaluation_dashboard_app/lib/perception_catalog_io.py +++ b/evaluation_dashboard_app/lib/perception_catalog_io.py @@ -337,38 +337,62 @@ def build_scene_dataframe_from_pkl_dir( total = len(pkl_files) df = SceneDataFrame(current=pd.DataFrame()) + + def _report_progress(done: int) -> None: + if on_progress: + on_progress(done, total) + for i, pkl_file in enumerate(pkl_files): - if str(pkl_file).lower().endswith(".pkl.z"): - try: - data = joblib.load(pkl_file) - except NameError: - raise ImportError("joblib is required for .pkl.z: pip install joblib") - else: - with open(pkl_file, "rb") as f: - data = pickle.load(f) + try: + if str(pkl_file).lower().endswith(".pkl.z"): + try: + data = joblib.load(pkl_file) + except NameError: + raise ImportError("joblib is required for .pkl.z: pip install joblib") + else: + with open(pkl_file, "rb") as f: + data = pickle.load(f) + except Exception as e: + if on_skip: + on_skip(pkl_file, f"failed to load: {e}") + _report_progress(i + 1) + continue + raise data = _normalize_loaded_pkl( data, pkl_file=pkl_file, project_id=project_id, job_id=job_id, ) - df_ = _scenarios_to_df_local(data, scenario_parser_function=scene2df, debug=False) + try: + df_ = _scenarios_to_df_local(data, scenario_parser_function=scene2df, debug=False) + except Exception as e: + if on_skip: + on_skip(pkl_file, f"failed to convert: {e}") + _report_progress(i + 1) + continue + raise del data if df_.empty(): if skip_empty: if on_skip: on_skip(pkl_file, "empty") + del df_ + gc.collect() + _report_progress(i + 1) continue if skip_bad_dtype and hasattr(df_, "current") and "x_error" in getattr(df_.current, "columns", []): if df_.current["x_error"].dtype != "float64": if on_skip: on_skip(pkl_file, f"bad dtype x_error={df_.current['x_error'].dtype}") + del df_ + gc.collect() + _report_progress(i + 1) 
continue df = df.concatenate(df_) del df_ gc.collect() - if on_progress: - on_progress(i + 1, total) + _report_progress(i + 1) return df diff --git a/evaluation_dashboard_app/lib/perception_eval_result_summarizer.py b/evaluation_dashboard_app/lib/perception_eval_result_summarizer.py index 770fd43..6083498 100644 --- a/evaluation_dashboard_app/lib/perception_eval_result_summarizer.py +++ b/evaluation_dashboard_app/lib/perception_eval_result_summarizer.py @@ -498,86 +498,146 @@ def calc_score_single(df, result_directory): return {} found_gt, pos, prev_frame, uuid_list, obj_idx = False, [], -1, [], 0 res, obj_group, criteria_max_dist = get_option_and_object_group(result_directory) + + frame_data = {} for i in range(total_row_num): - if ( - isnull(df.loc[(i, "ground_truth"), "timestamp"]) - # or df.loc[(i, "ground_truth"), "frame"] == prev_frame - ): - continue + frame_num = df.loc[(i, "estimation"), "frame"] + if isnull(frame_num): + frame_num = df.loc[(i, "ground_truth"), "frame"] - if df.loc[(i, "ground_truth"), "frame"] == prev_frame: - obj_idx += 1 - else: - obj_idx = 0 - - prev_frame = df.loc[(i, "ground_truth"), "frame"] - act_x = df.loc[(i, "ground_truth"), "x"] - act_y = df.loc[(i, "ground_truth"), "y"] - act_dist = math.sqrt(act_x**2 + act_y**2) - act_vx = df.loc[(i, "ground_truth"), "vx"] - act_vy = df.loc[(i, "ground_truth"), "vy"] - # act_vel = math.sqrt(act_vx**2 + act_vy**2) - point = {"x": -act_y, "y": act_x, "dist": act_dist, "vx": -act_vx, "vy": act_vy} - - if act_dist < criteria_max_dist[0]: - key = "criteria0" - dist_err_torelance = 2 - elif act_dist < criteria_max_dist[1]: - key = "criteria1" - dist_err_torelance = 3 - elif act_dist < criteria_max_dist[2]: - key = "criteria2" - dist_err_torelance = 5 - elif act_dist < criteria_max_dist[3]: - key = "criteria3" - dist_err_torelance = 5 - else: - raise ValueError("act_dist is out of range") - - act_label = df.loc[(i, "ground_truth"), "label"] - if not found_gt: - found_gt = True - 
res["criteria0"]["GT_OBJ"] = df.loc[(i, "ground_truth"), "label"] - res["criteria1"]["GT_OBJ"] = df.loc[(i, "ground_truth"), "label"] - res["criteria2"]["GT_OBJ"] = df.loc[(i, "ground_truth"), "label"] + if frame_num not in frame_data: + frame_data[frame_num] = {"ground_truth": [], "estimation": []} if not isnull(df.loc[(i, "estimation"), "timestamp"]): - est_label = df.loc[(i, "estimation"), "label"] - if act_label != "false_positive": - est_x = df.loc[(i, "estimation"), "x"] - est_y = df.loc[(i, "estimation"), "y"] - diff_dist = math.sqrt((act_x - est_x) ** 2 + (act_y - est_y) ** 2) - est_uuid = df.loc[(i, "estimation"), "uuid"] - if est_uuid not in uuid_list: - uuid_list.append(est_uuid) - # print("param:", df.loc[(i, "estimation"), "timestamp"], act_x, act_y, act_label, est_x, est_y, est_label, diff_dist) + frame_data[frame_num]["estimation"].append( + { + "index": i, + "x": df.loc[(i, "estimation"), "x"], + "y": df.loc[(i, "estimation"), "y"], + "label": df.loc[(i, "estimation"), "label"], + "uuid": df.loc[(i, "estimation"), "uuid"], + "timestamp": df.loc[(i, "estimation"), "timestamp"], + } + ) - if act_label == est_label: - if diff_dist < dist_err_torelance: - status = "TP/TN" + if not isnull(df.loc[(i, "ground_truth"), "timestamp"]): + frame_data[frame_num]["ground_truth"].append( + { + "index": i, + "x": df.loc[(i, "ground_truth"), "x"], + "y": df.loc[(i, "ground_truth"), "y"], + "label": df.loc[(i, "ground_truth"), "label"], + "vx": df.loc[(i, "ground_truth"), "vx"], + "vy": df.loc[(i, "ground_truth"), "vy"], + "frame": df.loc[(i, "ground_truth"), "frame"], + } + ) + + for frame_num in sorted(frame_data.keys()): + frame_gt_list = frame_data[frame_num]["ground_truth"] + frame_est_list = frame_data[frame_num]["estimation"] + + for gt_obj in frame_gt_list: + i = gt_obj["index"] + act_uuid = df.loc[(i, "ground_truth"), "uuid"] + est_uuid = "" + prev_frame = df.loc[(i, "ground_truth"), "frame"] + + act_x = gt_obj["x"] + act_y = gt_obj["y"] + act_dist = 
math.sqrt(act_x**2 + act_y**2) + act_vx = gt_obj["vx"] + act_vy = gt_obj["vy"] + point = {"x": -act_y, "y": act_x, "dist": act_dist, "vx": -act_vx, "vy": act_vy} + + if act_dist < criteria_max_dist[0]: + key = "criteria0" + dist_err_torelance = 2 + elif act_dist < criteria_max_dist[1]: + key = "criteria1" + dist_err_torelance = 3 + elif act_dist < criteria_max_dist[2]: + key = "criteria2" + dist_err_torelance = 5 + elif act_dist < criteria_max_dist[3]: + key = "criteria3" + dist_err_torelance = 5 + else: + raise ValueError("act_dist is out of range") + + act_label = gt_obj["label"] + if not found_gt: + found_gt = True + res["criteria0"]["GT_OBJ"] = act_label + res["criteria1"]["GT_OBJ"] = act_label + res["criteria2"]["GT_OBJ"] = act_label + + if not isnull(df.loc[(i, "estimation"), "timestamp"]): + est_label = df.loc[(i, "estimation"), "label"] + est_uuid = df.loc[(i, "estimation"), "uuid"] + if act_label != "false_positive": + est_x = df.loc[(i, "estimation"), "x"] + est_y = df.loc[(i, "estimation"), "y"] + diff_dist = math.sqrt((act_x - est_x) ** 2 + (act_y - est_y) ** 2) + + if est_uuid not in uuid_list: + uuid_list.append(est_uuid) + + if act_label == est_label: + if diff_dist < dist_err_torelance: + status = "TP/TN" + else: + status = "ADD" + elif est_label in obj_group[act_label]: + status = "AIL" else: - status = "ADD" - elif est_label in obj_group[act_label]: - status = "AIL" + status = "UIL" else: - status = "UIL" - else: - status = "PFN/PFP" - res[key]["OBJ_CNTS"].setdefault(est_label, 0) - res[key]["OBJ_CNTS"][est_label] += 1 - else: - if act_label != "false_positive": - status = "PFN/PFP" + status = "PFN/PFP" + res[key]["OBJ_CNTS"].setdefault(est_label, 0) + res[key]["OBJ_CNTS"][est_label] += 1 else: - status = "TP/TN" - res[key][status] += 1 - res[key]["NM"] += 1 - res[key]["UUID_NUM"] = len(uuid_list) - point["status"] = status - point["uuid_num"] = len(uuid_list) - if obj_idx == len(pos): - pos.append([]) - pos[obj_idx].append(point) + if act_label 
!= "false_positive": + closest_dist = float("inf") + closest_est = None + + for est_obj in frame_est_list: + diff_dist = math.sqrt((act_x - est_obj["x"]) ** 2 + (act_y - est_obj["y"]) ** 2) + if diff_dist < closest_dist: + closest_dist = diff_dist + closest_est = est_obj + + if closest_est is not None and closest_dist < 1.0: + est_label = closest_est["label"] + est_uuid = closest_est["uuid"] + + if est_uuid is not None and est_uuid not in uuid_list: + uuid_list.append(est_uuid) + + if act_label == est_label: + if closest_dist < dist_err_torelance: + status = "TP/TN" + else: + status = "ADD" + elif est_label in obj_group[act_label]: + status = "AIL" + else: + status = "UIL" + + res[key]["OBJ_CNTS"].setdefault(est_label, 0) + res[key]["OBJ_CNTS"][est_label] += 1 + else: + status = "PFN/PFP" + else: + status = "TP/TN" + + res[key][status] += 1 + res[key]["NM"] += 1 + res[key]["UUID_NUM"] = len(uuid_list) + point["status"] = status + point["act_uuid"] = act_uuid + point["est_uuid"] = est_uuid + pos.append(point) with open(result_directory + "score.json", "w") as file: file.write(json.dumps(res, indent=4)) diff --git a/evaluation_dashboard_app/lib/prediction_eval.py b/evaluation_dashboard_app/lib/prediction_eval.py new file mode 100644 index 0000000..b8f3e4e --- /dev/null +++ b/evaluation_dashboard_app/lib/prediction_eval.py @@ -0,0 +1,555 @@ +from __future__ import annotations + +from typing import Callable, Iterable, Sequence + +import numpy as np +import pandas as pd + + +DISTANCE_BIN_LABELS: list[str] = [ + "0-20 m", + "20-40 m", + "40-60 m", + "60-80 m", + "80-100 m", + "100-120 m", + "120-140 m", + "140-160 m", + "160-180 m", + "180-200 m", + "200+ m", +] + + +def actor_bucket(label: str | None) -> str: + value = str(label or "").strip().lower() + if value in {"car", "truck", "bus", "trailer"}: + return "vehicle" + if value == "pedestrian": + return "pedestrian" + if value in {"bicycle", "motorbike", "motorcycle"}: + return "bicycle" + return "other" + + +def 
infer_scenario_context(name: str | None) -> str: + text = str(name or "").strip().lower().replace("_", " ").replace("-", " ") + if any(token in text for token in ("crosswalk", "crossing", "jaywalk")): + return "crossing" + if any(token in text for token in ("merge", "ramp")): + return "merge" + if any(token in text for token in ("same lane", "follow", "following")): + return "same-lane" + if any(token in text for token in ("left turn", "right turn", "uturn", "u turn", "turn")): + return "turning" + if any(token in text for token in ("cut in", "cutin", "lane change", "overtake")): + return "cut-in" + return "other" + + +def _metric_label(prefix: str, checkpoint: float | int) -> str: + if float(checkpoint).is_integer(): + checkpoint = int(checkpoint) + return f"{prefix}@{checkpoint}s" + + +def _distance_bin(value: float | int | None) -> str | pd.NA: + if value is None or pd.isna(value): + return pd.NA + edges = list(range(0, 201, 20)) + for start, end, label in zip(edges[:-1], edges[1:], DISTANCE_BIN_LABELS[:-1]): + if start <= float(value) < end: + return label + return DISTANCE_BIN_LABELS[-1] + + +def _ensure_numeric(df: pd.DataFrame, columns: Iterable[str]) -> pd.DataFrame: + out = df.copy() + for col in columns: + if col in out.columns: + out[col] = pd.to_numeric(out[col], errors="coerce") + return out + + +def _noop_progress(_: float, __: str) -> None: + return None + + +def _parse_r_upper_bound(label: object) -> float: + text = str(label) + try: + return float(text.split("-")[-1]) + except ValueError: + return float("inf") + + +def prepare_future_matched_df( + future_df: pd.DataFrame, + *, + time_step: float = 1.0, + coord_abs_limit: float = 1e6, + max_error_m: float = 200.0, +) -> pd.DataFrame: + df = future_df.copy() + df = _ensure_numeric(df, ("frame_index", "relative_time", "x", "y", "tx", "ty", "mode")) + df["frame_index_num"] = df["frame_index"] + df["aligned_horizon_sec"] = (df["relative_time"] / max(time_step, 1e-6)).round() * time_step + if {"x", 
"y"}.issubset(df.columns): + df["start_distance_m"] = np.sqrt(df["x"].pow(2) + df["y"].pow(2)) + else: + df["start_distance_m"] = np.sqrt(df["tx"].pow(2) + df["ty"].pow(2)) + + key_cols = ["suite_name", "scenario_name", "frame_index_num"] + + gt = df[df["source"].astype(str).str.upper() == "GT"].copy() + est = df[df["source"].astype(str).str.upper() == "EST"].copy() + if "confidence" not in est.columns: + est["confidence"] = np.nan + + gt_start = ( + gt.sort_values("relative_time") + .groupby(key_cols + ["uuid"], dropna=False) + .agg(start_distance_m=("start_distance_m", "first")) + .reset_index() + .rename(columns={"uuid": "uuid_gt"}) + ) + + gt_h = ( + gt[key_cols + ["uuid", "aligned_horizon_sec", "tx", "ty", "label"]] + .dropna(subset=["aligned_horizon_sec"]) + .drop_duplicates(key_cols + ["uuid", "aligned_horizon_sec"]) + .rename( + columns={ + "uuid": "uuid_gt", + "tx": "tx_f_gt", + "ty": "ty_f_gt", + "label": "label_gt", + } + ) + ) + est_h = ( + est[key_cols + ["uuid", "pair_uuid", "mode", "aligned_horizon_sec", "tx", "ty", "confidence"]] + .dropna(subset=["aligned_horizon_sec"]) + .drop_duplicates(key_cols + ["uuid", "pair_uuid", "mode", "aligned_horizon_sec"]) + .rename( + columns={ + "uuid": "uuid_est", + "pair_uuid": "uuid_gt", + "tx": "tx_f_est", + "ty": "ty_f_est", + "confidence": "confidence_est", + } + ) + ) + + matched = est_h.merge(gt_h, on=key_cols + ["uuid_gt", "aligned_horizon_sec"], how="inner") + if matched.empty: + return matched + + matched = matched.merge(gt_start, on=key_cols + ["uuid_gt"], how="left") + matched["track_key"] = ( + matched["scenario_name"].astype("string").fillna("") + + "::" + + matched["frame_index_num"].fillna(-1).astype(int).astype(str) + + "::" + + matched["uuid_est"].astype("string").fillna("") + ) + matched["disp_error_m"] = np.sqrt( + (matched["tx_f_est"] - matched["tx_f_gt"]).pow(2) + (matched["ty_f_est"] - matched["ty_f_gt"]).pow(2) + ) + matched["is_coordinate_outlier"] = ( + matched[["tx_f_est", "ty_f_est", 
"tx_f_gt", "ty_f_gt"]].abs().gt(coord_abs_limit).any(axis=1) + ) + matched["is_metric_outlier"] = matched["is_coordinate_outlier"] | matched["disp_error_m"].gt(max_error_m) + matched["actor_bucket"] = matched["label_gt"].map(actor_bucket) + matched["scenario_context"] = matched["scenario_name"].map(infer_scenario_context) + return matched + + +def build_future_mode_track_summary( + future_df: pd.DataFrame, + *, + checkpoints: Sequence[float] = (1.0, 2.0, 3.0), + time_step: float = 1.0, + coord_abs_limit: float = 1e6, + max_error_m: float = 200.0, +) -> pd.DataFrame: + matched = prepare_future_matched_df( + future_df, + time_step=time_step, + coord_abs_limit=coord_abs_limit, + max_error_m=max_error_m, + ) + return build_future_mode_track_summary_from_matched(matched, checkpoints=checkpoints) + + +def build_future_mode_track_summary_from_matched( + matched: pd.DataFrame, + *, + checkpoints: Sequence[float] = (1.0, 2.0, 3.0), +) -> pd.DataFrame: + if matched.empty: + return pd.DataFrame( + columns=[ + "track_key", + "suite_name", + "scenario_name", + "frame_index_num", + "uuid_gt", + "uuid_est", + "label_gt", + "mode_count", + "start_distance_m", + ] + ) + + sane = matched[~matched["is_metric_outlier"]].copy() + if sane.empty: + sane = matched.copy() + + group_cols = ["track_key", "suite_name", "scenario_name", "frame_index_num", "uuid_gt", "uuid_est", "label_gt"] + track_summary = ( + sane.groupby(group_cols, dropna=False) + .agg( + mode_count=("mode", "nunique"), + start_distance_m=("start_distance_m", "first"), + confidence_mean=("confidence_est", "mean"), + horizon_max_sec=("aligned_horizon_sec", "max"), + ) + .reset_index() + ) + + for checkpoint in checkpoints: + upto = sane[sane["aligned_horizon_sec"] <= checkpoint].copy() + if upto.empty: + track_summary[_metric_label("minADE", checkpoint)] = np.nan + track_summary[_metric_label("minFDE", checkpoint)] = np.nan + continue + ade_mode = ( + upto.groupby(group_cols + ["mode"], dropna=False)["disp_error_m"] + 
.mean() + .reset_index(name="ade_m") + ) + fde_mode = ( + upto.sort_values("aligned_horizon_sec") + .groupby(group_cols + ["mode"], dropna=False) + .tail(1)[group_cols + ["mode", "disp_error_m"]] + .rename(columns={"disp_error_m": "fde_m"}) + ) + best_ade = ade_mode.groupby(group_cols, dropna=False)["ade_m"].min().reset_index() + best_fde = fde_mode.groupby(group_cols, dropna=False)["fde_m"].min().reset_index() + track_summary = track_summary.merge( + best_ade.rename(columns={"ade_m": _metric_label("minADE", checkpoint)}), + on=group_cols, + how="left", + ).merge( + best_fde.rename(columns={"fde_m": _metric_label("minFDE", checkpoint)}), + on=group_cols, + how="left", + ) + + track_summary["actor_bucket"] = track_summary["label_gt"].map(actor_bucket) + track_summary["scenario_context"] = track_summary["scenario_name"].map(infer_scenario_context) + track_summary["distance_bin"] = pd.Categorical( + track_summary["start_distance_m"].map(_distance_bin), + categories=DISTANCE_BIN_LABELS, + ordered=True, + ) + return track_summary + + +def build_best_mode_horizon_summary(matched_df: pd.DataFrame) -> pd.DataFrame: + if matched_df.empty: + return pd.DataFrame( + columns=[ + "track_key", + "aligned_horizon_sec", + "disp_error_m", + "scenario_name", + "actor_bucket", + "scenario_context", + ] + ) + sane = matched_df[~matched_df["is_metric_outlier"]].copy() + if sane.empty: + sane = matched_df.copy() + idx = sane.groupby(["track_key", "aligned_horizon_sec"], dropna=False)["disp_error_m"].idxmin() + out = sane.loc[idx, ["track_key", "aligned_horizon_sec", "disp_error_m", "scenario_name", "actor_bucket", "scenario_context"]].copy() + out = out.sort_values(["track_key", "aligned_horizon_sec"]).reset_index(drop=True) + return out + + +def build_future_mode_label_summary( + track_summary: pd.DataFrame, + *, + checkpoints: Sequence[float] = (1.0, 2.0, 3.0), +) -> pd.DataFrame: + if track_summary.empty: + return pd.DataFrame(columns=["Actor", "track_count", "mode_count_mean"]) + 
def build_horizon_breakdown(
    matched_df: pd.DataFrame,
    *,
    checkpoints: Sequence[float] | None = None,
) -> pd.DataFrame:
    """Summarise displacement error per horizon checkpoint plus a final-step value.

    For each checkpoint, rows with ``aligned_horizon_sec <= checkpoint`` are
    averaged per ``track_key`` and those per-track means are averaged again
    (label built with ``_metric_label("ADE", checkpoint)``). Checkpoints that
    select no rows are skipped. A closing ``"FDE@final"`` row averages the
    error of each track's last row when ordered by horizon.

    Args:
        matched_df: Rows with ``track_key``, ``aligned_horizon_sec`` and
            ``disp_error_m``.
        checkpoints: Horizons to report. When ``None``, every distinct
            positive horizon present in the data is used, in ascending order.

    Returns:
        A frame with ``metric`` and ``value_m`` columns (empty, with those
        columns, when ``matched_df`` is empty).
    """
    if matched_df.empty:
        return pd.DataFrame(columns=["metric", "value_m"])

    numeric = _ensure_numeric(matched_df, ("aligned_horizon_sec", "disp_error_m"))
    horizon = numeric["aligned_horizon_sec"]

    if checkpoints is None:
        positive_horizons = [value for value in horizon.dropna().unique() if value > 0]
        checkpoints = tuple(sorted(positive_horizons))

    records: list[dict[str, float | str]] = []
    for limit in checkpoints:
        window = numeric[horizon <= limit]
        if window.empty:
            continue
        track_means = window.groupby("track_key", dropna=False)["disp_error_m"].mean()
        records.append(
            {
                "metric": _metric_label("ADE", limit),
                "value_m": float(track_means.mean()),
            }
        )

    # Last row of every track once ordered by horizon.
    last_per_track = numeric.sort_values("aligned_horizon_sec").groupby("track_key", dropna=False).tail(1)
    records.append({"metric": "FDE@final", "value_m": float(last_per_track["disp_error_m"].mean())})
    return pd.DataFrame(records)
current["frame_index"] + if "uuid_gt" not in current.columns and "uuid" in current.columns: + current["uuid_gt"] = current["uuid"] + distance_col = "center_distance_f" if "center_distance_f" in current.columns else "center_distance" + if distance_col in current.columns: + current_lookup = current.rename(columns={distance_col: "current_distance_m"})[ + ["scenario_name", "frame_index_num", "uuid_gt", "current_distance_m"] + ].drop_duplicates() + + matched_lookup = None + if not matched_df.empty: + matched = matched_df.copy() + matched = _ensure_numeric(matched, ("frame_index_num", "tx_f_gt", "ty_f_gt", "start_distance_m")) + if "start_distance_m" not in matched.columns and {"tx_f_gt", "ty_f_gt"}.issubset(matched.columns): + matched["start_distance_m"] = np.sqrt(matched["tx_f_gt"].pow(2) + matched["ty_f_gt"].pow(2)) + cols = ["track_key", "start_distance_m"] + if {"scenario_name", "frame_index_num", "uuid_gt"}.issubset(matched.columns): + cols += ["scenario_name", "frame_index_num", "uuid_gt"] + matched_lookup = matched[cols].drop_duplicates() + + if current_lookup is not None and {"scenario_name", "frame_index_num", "uuid_gt"}.issubset(enriched.columns): + enriched = enriched.merge(current_lookup, on=["scenario_name", "frame_index_num", "uuid_gt"], how="left") + else: + enriched["current_distance_m"] = np.nan + + if matched_lookup is not None: + join_cols = ["track_key"] if "track_key" in enriched.columns and "track_key" in matched_lookup.columns else [] + if not join_cols and {"scenario_name", "frame_index_num", "uuid_gt"}.issubset(enriched.columns) and {"scenario_name", "frame_index_num", "uuid_gt"}.issubset(matched_lookup.columns): + join_cols = ["scenario_name", "frame_index_num", "uuid_gt"] + if join_cols: + enriched = enriched.merge( + matched_lookup[join_cols + ["start_distance_m"]].drop_duplicates(), + on=join_cols, + how="left", + ) + else: + enriched["start_distance_m"] = np.nan + elif "start_distance_m" not in enriched.columns: + 
def build_distance_bin_metrics(track_df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate per-track ADE/FDE statistics for every distance bin.

    The input is copied, never modified. An existing ``distance_bin`` column
    is re-cast to the ordered ``DISTANCE_BIN_LABELS`` categorical; otherwise
    the bin is derived from ``start_distance_m`` via ``_distance_bin``.
    Grouping uses ``observed=False`` so every label in ``DISTANCE_BIN_LABELS``
    gets a row even when no track falls into it.

    Returns:
        One row per bin with ``count`` (distinct ``track_key``), mean
        ``ade_m``, mean ``fde_m`` and the 90th/95th percentile of ``fde_m``
        (NaN when the bin has no non-null ``fde_m``).
    """

    def _quantile_or_nan(values: pd.Series, q: float):
        # A bin without any valid FDE has no percentile to report.
        return values.quantile(q) if len(values.dropna()) else np.nan

    data = track_df.copy()
    if "distance_bin" in data.columns:
        bin_source = data["distance_bin"]
    else:
        bin_source = data["start_distance_m"].map(_distance_bin)
    data["distance_bin"] = pd.Categorical(bin_source, categories=DISTANCE_BIN_LABELS, ordered=True)

    aggregations = {
        "count": ("track_key", "nunique"),
        "ade_m": ("ade_m", "mean"),
        "fde_m": ("fde_m", "mean"),
        "p90_fde_m": ("fde_m", lambda s: _quantile_or_nan(s, 0.90)),
        "p95_fde_m": ("fde_m", lambda s: _quantile_or_nan(s, 0.95)),
    }
    return data.groupby("distance_bin", observed=False).agg(**aggregations).reset_index()
dict[str, pd.DataFrame]: + from perception_catalog_analyzer.specsheet.blocks import bin_polar + from perception_catalog_analyzer.specsheet.metrics import load_metrics + from perception_catalog_analyzer.specsheet.metrics.functional import FUTURE_ARRAY_CACHE + + report = progress_callback or _noop_progress + FUTURE_ARRAY_CACHE.clear() + metric_order = [_metric_label(prefix, checkpoint) for prefix in ("minADE", "minFDE") for checkpoint in checkpoints] + metric_map = {metric.name: metric for metric in load_metrics(metric_order)} + + report(0.02, "Binning rows in the same polar grid used by the specsheet...") + normalized_future = _ensure_numeric( + future_df, + ("frame_index", "relative_time", "x", "y", "tx", "ty", "mode", "confidence"), + ) + required_future_cols = ["source", "label", "uuid", "pair_uuid", "frame_index", "relative_time", "tx", "ty"] + present_required_cols = [col for col in required_future_cols if col in normalized_future.columns] + if present_required_cols: + normalized_future = normalized_future.dropna(subset=present_required_cols) + normalized_future = normalized_future.sort_values( + [col for col in ["label", "frame_index", "pair_uuid", "uuid", "mode", "relative_time"] if col in normalized_future.columns], + kind="stable", + ).reset_index(drop=True) + + binned_future = bin_polar(normalized_future.copy()) + if binned_future.empty: + report(0.9, "No future rows were available after binning.") + empty = pd.DataFrame() + return { + "label_summary": empty, + "distance_summary": empty, + "polar_summary": empty, + } + + labels = sorted(str(v) for v in binned_future["label"].dropna().unique() if str(v).strip()) + total_labels = max(len(labels), 1) + total_metrics = max(len(metric_order), 1) + + label_rows: list[dict[str, object]] = [] + distance_rows: list[dict[str, object]] = [] + polar_rows: list[dict[str, object]] = [] + + report(0.28, f"Found {len(labels)} labels to aggregate.") + for label_idx, label_name in enumerate(labels, start=1): + label_start = 
0.3 + (0.52 * (label_idx - 1) / total_labels) + label_end = 0.3 + (0.52 * label_idx / total_labels) + report(label_start, f"Aggregating label `{label_name}` ({label_idx}/{total_labels})...") + scoped = binned_future[binned_future["label"].astype(str) == label_name].copy() + est_scoped = scoped[scoped["source"].astype(str).str.upper() == "EST"].copy() + + label_groups = list(scoped.groupby(["r", "theta"], observed=True)) + total_groups = max(len(label_groups), 1) + for group_idx, ((r_name, theta_name), sub_df) in enumerate(label_groups, start=1): + warmup_progress = label_start + ((label_end - label_start) * 0.35 * group_idx / total_groups) + report( + warmup_progress, + f"Preparing label `{label_name}` ({label_idx}/{total_labels}) future arrays: bin `{r_name}` / `{theta_name}` ({group_idx}/{total_groups})...", + ) + for metric_name in metric_order: + metric = metric_map[metric_name] + metric.apply(sub_df) + + row: dict[str, object] = { + "label": label_name, + "future_rows": int(est_scoped[["scenario_name", "frame_index", "uuid"]].drop_duplicates().shape[0]) + if {"scenario_name", "frame_index", "uuid"}.issubset(est_scoped.columns) + else int(len(est_scoped)), + } + for metric_idx, metric_name in enumerate(metric_order, start=1): + metric_progress = label_start + ((label_end - label_start) * (0.35 + (0.65 * metric_idx / total_metrics))) + report( + metric_progress, + f"Aggregating label `{label_name}` ({label_idx}/{total_labels}), metric `{metric_name}` ({metric_idx}/{total_metrics})...", + ) + metric = metric_map[metric_name] + metric_df = metric.apply(scoped) + each_bin_df = metric.get_each_bin(metric_df) + around_df = metric.get_all_around(scoped).dropna(subset=[metric_name]).copy() + near_mask = around_df["r"].map(_parse_r_upper_bound) <= 60.0 + near_values = around_df.loc[near_mask, metric_name].dropna() + row[metric_name] = float(np.nanmean(near_values.to_numpy(dtype=float))) if not near_values.empty else None + + if not around_df.empty: + for rec in 
around_df[["r", metric_name]].to_dict("records"): + distance_rows.append( + { + "label": label_name, + "metric": metric_name, + "r": rec["r"], + "value": rec[metric_name], + } + ) + + polar_df = each_bin_df.dropna(subset=[metric_name]).copy() + if not polar_df.empty: + polar_df["label"] = label_name + polar_df["metric"] = metric_name + polar_df = polar_df.rename(columns={metric_name: "value"}) + polar_rows.extend(polar_df[["label", "metric", "r", "theta", "value"]].to_dict("records")) + + label_rows.append(row) + + label_summary = pd.DataFrame(label_rows) + if not label_summary.empty: + report(0.86, "Finalizing overall summary row...") + total_rows = float(label_summary["future_rows"].sum()) + overall_row: dict[str, object] = { + "label": "All", + "future_rows": int(total_rows), + } + for metric_name in metric_order: + valid = label_summary[["future_rows", metric_name]].dropna() + if valid.empty or float(valid["future_rows"].sum()) <= 0: + overall_row[metric_name] = None + else: + overall_row[metric_name] = float( + (valid["future_rows"] * valid[metric_name]).sum() / valid["future_rows"].sum() + ) + label_summary = pd.concat([pd.DataFrame([overall_row]), label_summary], ignore_index=True) + report(0.9, "Prediction summary tables are ready for cache save.") + + return { + "label_summary": label_summary, + "distance_summary": pd.DataFrame(distance_rows), + "polar_summary": pd.DataFrame(polar_rows), + } diff --git a/evaluation_dashboard_app/lib/release_specsheet_library.py b/evaluation_dashboard_app/lib/release_specsheet_library.py new file mode 100644 index 0000000..ce51f84 --- /dev/null +++ b/evaluation_dashboard_app/lib/release_specsheet_library.py @@ -0,0 +1,165 @@ +from __future__ import annotations + +import urllib.parse +from pathlib import Path +from typing import Any + +import yaml + +from lib.path_utils import get_run_display_name, path_display + + +RELEASE_ROLE_DIRS = ("performance", "usecase", "devops") +DEFAULT_EVALUATOR_PROJECT_ID = "x2_dev" 
def _evaluator_report_url(job_id: str, project_id: str = DEFAULT_EVALUATOR_PROJECT_ID) -> str:
    """Return the evaluator report URL for ``job_id``, or ``""`` when it is empty.

    Args:
        job_id: Evaluator job identifier, used as the last path segment.
        project_id: Value of the ``project_id`` query parameter.

    Returns:
        ``{EVALUATOR_REPORT_BASE_URL}/{job_id}?project_id=...`` with both the
        path segment and the query value percent-encoded.
    """
    if not job_id:
        return ""
    # The id is read from metadata files; encode it so characters such as
    # "/", "?", "#" or spaces cannot change the structure of the link.
    job_part = urllib.parse.quote(str(job_id), safe="")
    query = urllib.parse.urlencode({"project_id": project_id})
    return f"{EVALUATOR_REPORT_BASE_URL}/{job_part}?{query}"
list[dict[str, Any]] = [] + for pdf_path in sorted(specsheet_root.glob("**/*.pdf")): + if pdf_path.parent == specsheet_root and topic_pdf_paths: + continue + topic = pdf_path.parent.name if pdf_path.parent != specsheet_root else "default" + static_path = ( + Path.cwd() + / "static" + / "release_specs" + / _safe_url_part(release_dir.name.replace("release_spec_", "", 1), "release") + / f"{_safe_url_part(topic, 'topic')}.pdf" + ) + pdfs.append( + { + "topic": topic, + "path": pdf_path, + "display_path": path_display(pdf_path), + "absolute_path": str(pdf_path.resolve()), + "static_path": static_path, + "static_url": _pdf_static_url(release_dir.name.replace("release_spec_", "", 1), topic), + "available": pdf_path.exists() and not pdf_path.is_dir(), + "static_available": static_path.exists() and not static_path.is_dir(), + } + ) + + roles: dict[str, dict[str, Any]] = {} + for role in RELEASE_ROLE_DIRS: + role_dir = release_dir / role + if not role_dir.is_dir(): + continue + role_metadata = _role_metadata(role_dir) + job_id = str(role_metadata.get("job_id") or "").strip() + project_id = str(role_metadata.get("project_id") or DEFAULT_EVALUATOR_PROJECT_ID).strip() + roles[role] = { + "path": role_dir, + "display_path": path_display(role_dir), + "absolute_path": str(role_dir.resolve()), + "run_name": get_run_display_name(role_dir), + "overview_query": _overview_query(role_dir), + "overview_url": f"/?{_overview_query(role_dir)}", + "job_id": job_id, + "project_id": project_id, + "evaluator_report_url": _evaluator_report_url(job_id, project_id), + "has_parquet": any(role_dir.glob("*.parquet")), + "has_summary": (role_dir / "summary.json").exists() or (role_dir / "resources" / "summary.json").exists(), + "has_metadata": (role_dir / "metadata.yaml").exists() or (role_dir / "resources" / "metadata.yaml").exists(), + } + + rows.append( + { + "release_dir": release_dir, + "release_dir_display": path_display(release_dir), + "release_dir_absolute": str(release_dir.resolve()), + 
def discover_ready_release_specsheets(data_root: Path) -> list[dict[str, Any]]:
    """Flatten the release inventory into one row per discovered PDF.

    Each row repeats the release-level fields and adds the PDF's ``topic`` and
    ``path``. ``view_run`` / ``overview_query`` come from the release's
    ``performance`` role when present, otherwise from the first role found,
    and are empty strings when the release has no roles.
    """
    ready: list[dict[str, Any]] = []
    for release in discover_release_specsheet_inventory(data_root):
        roles = release["roles"]
        default_run = roles.get("performance") or next(iter(roles.values()), {})
        ready.extend(
            {
                "release_dir": release["release_dir"],
                "pdf_path": pdf["path"],
                "release": release["release"],
                "version": release["version"],
                "date": release["date"],
                "description": release["description"],
                "topic": pdf["topic"],
                "view_run": default_run.get("run_name", ""),
                "overview_query": default_run.get("overview_query", ""),
            }
            for pdf in release["pdfs"]
        )
    return ready
"MAX_DIST_THRESH0", "OBJ_CNTS0", - "Distance1", "NM1", "TP/TN1", "ADD1", "AIL1", "UIL1", "PFN/PFP1", "UUID Num1", "Practical Pass Rate1", "MAX_DIST_THRESH1", "OBJ_CNTS1", - "Distance2", "NM2", "TP/TN2", "ADD2", "AIL2", "UIL2", "PFN/PFP2", "UUID Num2", "Practical Pass Rate2", "MAX_DIST_THRESH2", "OBJ_CNTS2", - "Distance3", "NM3", "TP/TN3", "ADD3", "AIL3", "UIL3", "PFN/PFP3", "UUID Num3", "Practical Pass Rate3", "MAX_DIST_THRESH3", "OBJ_CNTS3", - ] - ) if score_path.exists() else None + score = read_score_csv(score_path) return { "path": run_dir, "summary": None, @@ -60,17 +51,7 @@ def load_run(run_dir: Path): if col not in summary.columns: summary[col] = pd.Series([""] * len(summary), dtype="string") - score = pd.read_csv( - score_path, - header=None, - engine="python", - names=[ - "Scenario", "Option", "GT_OBJ", "Distance0", "NM0", "TP/TN0", "ADD0", "AIL0", "UIL0", "PFN/PFP0", "UUID Num0", "Practical Pass Rate0", "MAX_DIST_THRESH0", "OBJ_CNTS0", - "Distance1", "NM1", "TP/TN1", "ADD1", "AIL1", "UIL1", "PFN/PFP1", "UUID Num1", "Practical Pass Rate1", "MAX_DIST_THRESH1", "OBJ_CNTS1", - "Distance2", "NM2", "TP/TN2", "ADD2", "AIL2", "UIL2", "PFN/PFP2", "UUID Num2", "Practical Pass Rate2", "MAX_DIST_THRESH2", "OBJ_CNTS2", - "Distance3", "NM3", "TP/TN3", "ADD3", "AIL3", "UIL3", "PFN/PFP3", "UUID Num3", "Practical Pass Rate3", "MAX_DIST_THRESH3", "OBJ_CNTS3", - ] - ) if score_path.exists() else None + score = read_score_csv(score_path) return { "path": run_dir, diff --git a/evaluation_dashboard_app/lib/run_metadata.py b/evaluation_dashboard_app/lib/run_metadata.py new file mode 100644 index 0000000..f5821b1 --- /dev/null +++ b/evaluation_dashboard_app/lib/run_metadata.py @@ -0,0 +1,391 @@ +"""Helpers for durable per-run metadata stored alongside local run folders.""" + +from __future__ import annotations + +import json +import os +from datetime import datetime, timezone +from pathlib import Path +from tempfile import NamedTemporaryFile +from typing import Any, Dict, 
def _json_safe(value: Any) -> Any:
    """Recursively convert ``value`` into something ``json.dump`` accepts.

    * dict: keys are stringified, values converted recursively.
    * list / tuple: converted element-wise into a list.
    * set / frozenset: converted element-wise into a list, ordered by
      ``repr`` so the written file does not depend on set iteration order.
    * ``Path``: its string form.
    * ``datetime``: ISO-8601 in UTC without microseconds; naive values are
      taken to be UTC already.

    Anything else is returned unchanged.
    """
    if isinstance(value, dict):
        return {str(key): _json_safe(val) for key, val in value.items()}
    if isinstance(value, (list, tuple)):
        return [_json_safe(item) for item in value]
    if isinstance(value, (set, frozenset)):
        # json cannot serialise sets; members may be of mixed types, so order
        # by repr rather than by the values themselves.
        return sorted((_json_safe(item) for item in value), key=repr)
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, datetime):
        if value.tzinfo is None:
            value = value.replace(tzinfo=timezone.utc)
        return value.astimezone(timezone.utc).replace(microsecond=0).isoformat()
    return value
def write_run_metadata(run_path: Path, metadata: Dict[str, Any], *, create_missing: bool = False) -> Dict[str, Any]:
    """Atomically write ``metadata`` to the run's metadata file.

    The payload is a copy of ``metadata`` with ``schema_version``,
    ``run_name``, ``run_path``, ``run_path_display`` and ``updated_at``
    overwritten; ``created_at`` is filled in only when absent. The JSON is
    written to a temporary file inside ``run_path`` and then renamed over the
    target, so readers never see a half-written file.

    Args:
        run_path: Run directory that owns the metadata file.
        metadata: Values to store.
        create_missing: Create ``run_path`` when it does not exist.

    Returns:
        The payload that was written (before JSON conversion).

    Raises:
        FileNotFoundError: ``run_path`` is missing and ``create_missing`` is
            false.
    """
    if create_missing:
        run_path.mkdir(parents=True, exist_ok=True)
    elif not run_path.exists():
        raise FileNotFoundError(str(run_path))

    payload = dict(metadata)
    payload["schema_version"] = RUN_METADATA_SCHEMA_VERSION
    payload["run_name"] = run_path.name
    payload["run_path"] = to_data_relative(run_path)
    payload["run_path_display"] = path_display(run_path)
    payload["updated_at"] = _utc_now_iso()
    payload.setdefault("created_at", payload["updated_at"])

    meta_path = metadata_path_for_run(run_path)
    tmp_path: Optional[Path] = None
    try:
        with NamedTemporaryFile("w", encoding="utf-8", dir=str(run_path), delete=False) as tmp:
            tmp_path = Path(tmp.name)
            json.dump(_json_safe(payload), tmp, ensure_ascii=False, indent=2, sort_keys=True)
            tmp.write("\n")
        try:
            # Temp files are created owner-only; make the result world-readable.
            os.chmod(tmp_path, 0o644)
        except OSError:
            pass  # best effort
        tmp_path.replace(meta_path)
    except BaseException:
        # delete=False means nobody else removes the temp file; do not leave
        # it in the run directory when serialisation or the rename fails.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass
        raise

    try:
        os.chmod(meta_path, 0o644)
    except OSError:
        pass  # best effort
    return payload
def flatten_metadata_text(value: Any) -> Iterable[str]:
    """Flatten an arbitrarily nested metadata value into a list of text fragments.

    * ``None`` yields nothing.
    * dict: each key (as ``str``, kept even when blank) followed by the
      fragments of its value, in iteration order.
    * list / tuple / set: the fragments of every item, in iteration order.
    * anything else: its stripped string form, omitted when empty.
    """
    if value is None:
        return []
    if isinstance(value, dict):
        return [
            fragment
            for key, item in value.items()
            for fragment in (str(key), *flatten_metadata_text(item))
        ]
    if isinstance(value, (list, tuple, set)):
        return [fragment for item in value for fragment in flatten_metadata_text(item)]
    stripped = str(value).strip()
    return [stripped] if stripped else []
find_run_directory(path_value, create_missing=False) + if run_dir is not None: + return run_dir + return None + + +def build_metadata_patch_from_task_row(task_row: Dict[str, Any]) -> Dict[str, Any]: + params = _as_dict(task_row.get("parameters")) + summary = _as_dict(task_row.get("result_summary")) + task_type = str(task_row.get("type") or "").strip() + request_output = str( + params.get("output_path") + or params.get("output_dir") + or params.get("eval_root") + or params.get("pkl_dir") + or task_row.get("result_path") + or "" + ).strip() + + patch: Dict[str, Any] = { + "source_mode": task_type, + "task": { + "id": str(task_row.get("id") or "").strip(), + "type": task_type, + "status": str(task_row.get("status") or "").strip(), + "requested_by": str(task_row.get("session_id") or "").strip(), + "created_at": task_row.get("created_at"), + "updated_at": task_row.get("updated_at"), + "result_path": str(task_row.get("result_path") or "").strip(), + "error_message": str(task_row.get("error_message") or "").strip(), + "progress_message": str(task_row.get("progress_message") or "").strip(), + "progress_pct": task_row.get("progress_pct"), + }, + "request": { + "environment": str(params.get("environment") or "default").strip() or "default", + "project_id": str(params.get("project_id") or "").strip(), + "job_id": str(params.get("job_id") or "").strip(), + "catalog_id": str(params.get("catalog_id") or "").strip(), + "integration_id": str(params.get("integration_id") or "").strip(), + "source_job_id": str(params.get("source_job_id") or "").strip(), + "target_name": str(params.get("target_name") or "").strip(), + "description": str(params.get("description") or "").strip(), + "suite_id": str(params.get("suite_id") or "").strip(), + "suite_ids": list(params.get("suite_ids") or []), + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": 
params.get("large_file_mb"), + "keep_zip_files": bool(params.get("keep_zip_files", False)), + "run_eval": bool(params.get("run_eval", False)), + "generate_parquet": bool(params.get("generate_parquet", False)), + "eval_recursive": bool(params.get("eval_recursive", False)), + "eval_overwrite": bool(params.get("eval_overwrite", False)), + "max_retries": params.get("max_retries"), + "clean_build": bool(params.get("clean_build", False)), + "debug": bool(params.get("debug", False)), + "is_tag": bool(params.get("is_tag", False)), + "scenario_name_filter": str(params.get("scenario_name_filter") or "").strip(), + "selected_ids": list(params.get("selected_ids") or []), + "output_path": request_output, + "parameters": params, + }, + "backfilled_from_task_history": True, + } + + if task_type == "download_results": + patch["download"] = { + "mode": "download_results", + "total": summary.get("total", 0), + "success": summary.get("success", 0), + "failed": summary.get("failed", 0), + "rows": list(summary.get("rows") or [])[:100], + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": params.get("large_file_mb"), + "keep_zip_files": bool(params.get("keep_zip_files", False)), + } + elif task_type == "download_scenarios": + patch["scenario_download"] = { + "total": summary.get("total", 0), + "success": summary.get("success", 0), + "failed": summary.get("failed", 0), + "rows": list(summary.get("rows") or [])[:100], + "overwrite": bool(params.get("overwrite", False)), + "scenario_name_filter": str(params.get("scenario_name_filter") or "").strip(), + "selected_ids": list(params.get("selected_ids") or []), + } + elif task_type == "run_eval_dirs": + patch["evaluation"] = { + "directories_processed": summary.get("directories_processed", 0), + "success": summary.get("success", 0), + "failed": summary.get("failed", 0), + "skipped": 
summary.get("skipped", 0), + "summary_path": str(summary.get("summary_path") or "").strip(), + "summary_rows": summary.get("summary_rows", 0), + "score_rows": summary.get("score_rows", 0), + "enabled": True, + "recursive": bool(params.get("recursive", True)), + "overwrite": bool(params.get("overwrite", False)), + } + elif task_type == "generate_summary_csv": + patch["evaluation"] = { + "summary_path": str(summary.get("summary_path") or "").strip(), + "summary_rows": summary.get("summary_rows", 0), + "score_rows": summary.get("score_rows", 0), + "enabled": True, + } + elif task_type == "build_parquet": + patch["parquet"] = { + "enabled": True, + "path": str(summary.get("output_path") or "").strip(), + } + elif task_type == "download_and_eval": + patch["download"] = { + "mode": "download_and_eval", + **_as_dict(summary.get("download_summary")), + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": params.get("large_file_mb"), + "keep_zip_files": bool(params.get("keep_zip_files", False)), + } + patch["evaluation"] = { + **_as_dict(summary.get("eval_summary")), + "enabled": bool(params.get("run_eval", False)), + "recursive": bool(params.get("eval_recursive", False)), + "overwrite": bool(params.get("eval_overwrite", False)), + } + patch["parquet"] = { + "enabled": bool(params.get("generate_parquet", False)), + "path": str(summary.get("parquet_path") or "").strip(), + } + errors = list(summary.get("errors") or []) + if errors: + patch["errors"] = errors + elif task_type == "run_evaluator_and_process": + patch["evaluator"] = { + "job_id": str(summary.get("evaluator_job_id") or params.get("job_id") or "").strip(), + "report_url": str(summary.get("evaluator_report_url") or "").strip(), + "status": str(summary.get("evaluator_status") or "").strip(), + "title": str(summary.get("evaluator_title") or params.get("description") or 
"").strip(), + "scheduled_by": str(summary.get("evaluator_scheduled_by") or "").strip(), + "build_status": str(summary.get("evaluator_build_status") or "").strip(), + "test_status": str(summary.get("evaluator_test_status") or "").strip(), + "fail_message": str(summary.get("evaluator_fail_message") or "").strip(), + "case_totals": _as_dict(summary.get("evaluator_case_totals")), + "suites": list(summary.get("evaluator_suites") or []), + "failed_cases": list(summary.get("evaluator_failed_cases") or []), + "catalog_id": str(params.get("catalog_id") or "").strip(), + "catalog_name": str(summary.get("evaluator_catalog_name") or "").strip(), + "catalog_version_id": str(summary.get("evaluator_catalog_version_id") or "").strip(), + "catalog_url": str(summary.get("evaluator_catalog_url") or "").strip(), + "integration_id": str(params.get("integration_id") or "").strip(), + "source_job_id": str(params.get("source_job_id") or "").strip(), + "target_name": str(params.get("target_name") or "").strip(), + "target": str(summary.get("evaluator_target") or params.get("target_name") or "").strip(), + "git_sha": str(summary.get("evaluator_git_sha") or "").strip(), + "git_ref_url": str(summary.get("evaluator_git_ref_url") or "").strip(), + "git_commit_url": str(summary.get("evaluator_git_commit_url") or "").strip(), + "source_url": str(summary.get("evaluator_source_url") or "").strip(), + "source_repo_label": str(summary.get("evaluator_source_repo_label") or "").strip(), + "description": str(params.get("description") or "").strip(), + "is_tag": bool(params.get("is_tag", False)), + } + patch["download"] = { + "mode": "run_evaluator_and_process", + **_as_dict(summary.get("download_summary")), + "rows": list(summary.get("download_rows") or [])[:100], + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": params.get("large_file_mb"), + 
# Identity columns that precede the per-criteria metric blocks of a score CSV.
SCORE_BASE_COLS = ["Scenario", "Option", "GT_OBJ"]
SCORE_BASE_COLS_WITH_DATASET = ["Scenario", "Dataset", "Option", "GT_OBJ"]

# Metric names of one criteria block in a raw frame; ``score_raw_columns``
# appends the criteria index to each name.
SCORE_SOURCE_METRIC_COLS = [
    "Distance",
    "NM",
    "TP/TN",
    "ADD",
    "AIL",
    "UIL",
    "PFN/PFP",
    "UUID Num",
    "Practical Pass Rate",
    "MAX_DIST_THRESH",
    "OBJ_CNTS",
]

# Names assigned positionally to one criteria block by ``build_score_view``.
SCORE_VIEW_METRIC_COLS = [
    "distance",
    "nm",
    "tp_tn",
    "add",
    "ail",
    "uil",
    "pfn_pfp",
    "uuid_num",
    "pass_rate",
    "max_dist_thresh",
    "obj_cnts",
]

# View columns that ``build_score_view`` coerces to numeric dtype.
SCORE_NUM_COLS = [
    "nm",
    "tp_tn",
    "add",
    "ail",
    "uil",
    "pfn_pfp",
    "uuid_num",
    "pass_rate",
    "max_dist_thresh",
]

# Number of columns occupied by one criteria block.
SCORE_BLOCK_SIZE = len(SCORE_VIEW_METRIC_COLS)


def _looks_like_header(row: pd.Series) -> bool:
    """Return True when the first cell of *row* is the literal ``Scenario``."""
    first = str(row.iloc[0]).strip() if len(row) else ""
    return first == "Scenario"


def _looks_like_criteria_cell(value: object) -> bool:
    """Return True when the stringified cell starts with ``criteria``."""
    text = str(value).strip()
    return text.startswith("criteria")


def _drop_extra_empty_trailing_columns(df: pd.DataFrame, base_count: int) -> pd.DataFrame:
    """Strip all-NaN trailing columns until the metric columns fill whole blocks.

    Columns are removed from the right only while the last column is entirely
    NaN and the number of non-base columns is not a multiple of
    ``SCORE_BLOCK_SIZE``.
    """
    while (
        df.shape[1]
        and df.iloc[:, -1].isna().all()
        and (df.shape[1] - base_count) % SCORE_BLOCK_SIZE != 0
    ):
        df = df.iloc[:, :-1]
    return df


def _infer_base_count(df: pd.DataFrame, header_row: pd.Series | None) -> int:
    """Decide whether the frame has 3 or 4 identity columns.

    With a header row the answer depends only on whether its second cell is
    ``Dataset``. Without one, the first data row is probed for a
    ``criteria...`` cell at position 4 or 3, and finally the column count is
    checked against the block size. The default is 3.
    """
    if header_row is not None:
        header_values = [str(x).strip() for x in header_row.tolist()]
        if len(header_values) >= 4 and header_values[1] == "Dataset":
            return 4
        return 3

    if df.empty:
        return 3
    first = df.iloc[0]
    if len(first) > 4 and _looks_like_criteria_cell(first.iloc[4]):
        return 4
    if len(first) > 3 and _looks_like_criteria_cell(first.iloc[3]):
        return 3

    ncols = df.shape[1]
    if ncols >= 4 and (ncols - 4) % SCORE_BLOCK_SIZE == 0:
        return 4
    return 3


def score_raw_columns(has_dataset: bool, criteria_count: int) -> list[str]:
    """Build the raw column names: base columns plus one suffixed block per criteria."""
    cols = list(SCORE_BASE_COLS_WITH_DATASET if has_dataset else SCORE_BASE_COLS)
    for i in range(criteria_count):
        cols.extend(f"{name}{i}" for name in SCORE_SOURCE_METRIC_COLS)
    return cols


def read_score_csv(score_path: Path) -> pd.DataFrame | None:
    """Read a score CSV and label its columns with the raw score schema.

    Returns:
        ``None`` when *score_path* does not exist, an empty frame when the file
        has no parsable content, otherwise the data rows (header row removed)
        with columns named by ``score_raw_columns``.

    Raises:
        ValueError: if the file has fewer columns than the base columns plus
            one full criteria block.
    """
    if not score_path.exists():
        return None

    try:
        raw = pd.read_csv(score_path, header=None, engine="python")
    except pd.errors.EmptyDataError:
        # pandas raises instead of returning an empty frame for a blank file;
        # treat it like the ``raw.empty`` case below.
        return pd.DataFrame()
    if raw.empty:
        return raw

    header_row = raw.iloc[0] if _looks_like_header(raw.iloc[0]) else None
    if header_row is not None:
        raw = raw.iloc[1:].reset_index(drop=True)

    base_count = _infer_base_count(raw, header_row)
    raw = _drop_extra_empty_trailing_columns(raw, base_count)
    criteria_count = max(1, (raw.shape[1] - base_count) // SCORE_BLOCK_SIZE)
    expected_cols = base_count + criteria_count * SCORE_BLOCK_SIZE
    if raw.shape[1] < expected_cols:
        # Same exception type pandas would raise on the column assignment
        # below, but with a message that names the file and the shortfall.
        raise ValueError(
            f"Score CSV {score_path} has {raw.shape[1]} columns; expected at least "
            f"{expected_cols} ({base_count} base + {SCORE_BLOCK_SIZE} per criteria block)."
        )
    raw = raw.iloc[:, :expected_cols].copy()
    raw.columns = score_raw_columns(base_count == 4, criteria_count)
    return raw.reset_index(drop=True)


def score_base_cols(df_raw: pd.DataFrame) -> list[str]:
    """Return the identity column list matching *df_raw* (with or without ``Dataset``)."""
    if df_raw is not None and "Dataset" in df_raw.columns:
        return list(SCORE_BASE_COLS_WITH_DATASET)
    return list(SCORE_BASE_COLS)


def infer_score_criteria_count(
    df_raw: pd.DataFrame,
    max_criteria: int = 32,
) -> int:
    """Return the number of criteria blocks in *df_raw*, clamped to ``[1, max_criteria]``."""
    if df_raw is None or df_raw.empty:
        return 1
    base_count = len(score_base_cols(df_raw))
    n = (df_raw.shape[1] - base_count) // SCORE_BLOCK_SIZE
    n = max(1, n)
    return int(min(n, max_criteria))


def build_score_view(df_raw: pd.DataFrame, criteria_idx: int) -> pd.DataFrame:
    """Return the base columns plus the metric block for *criteria_idx*.

    The block is selected positionally, renamed to ``SCORE_VIEW_METRIC_COLS``,
    and the columns in ``SCORE_NUM_COLS`` are coerced to numeric (unparsable
    cells become NaN).
    """
    base_cols = score_base_cols(df_raw)
    start = len(base_cols) + criteria_idx * SCORE_BLOCK_SIZE
    end = start + SCORE_BLOCK_SIZE

    df_view = df_raw.loc[:, base_cols].copy()
    block = df_raw.iloc[:, start:end].copy()
    block.columns = SCORE_VIEW_METRIC_COLS
    df_view = pd.concat([df_view, block], axis=1)
    for column in SCORE_NUM_COLS:
        df_view[column] = pd.to_numeric(df_view[column], errors="coerce")
    return df_view


def score_identity_cols(df: pd.DataFrame) -> list[str]:
    """Return the columns that identify a row: ``Scenario`` plus ``Dataset`` when present."""
    return ["Scenario", "Dataset"] if df is not None and "Dataset" in df.columns else ["Scenario"]
"yaw_error", + "speed_error", +] +FUTURE_SPECSHEET_METRICS = [ + "minADE@1s", + "minADE@3s", + "minADE@5s", + "minFDE@1s", + "minFDE@3s", + "minFDE@5s", +] +TREND_METADATA_FILENAME = "metadata.yaml" +TREND_SUMMARY_FILENAME = "summary.json" +SPECSHEET_RELEASE_ROLE_DIRS = ("performance", "usecase", "devops") +GENERATED_TREND_HISTORY_DIRNAME = "_app_trend_history" +FULL_DATASET_EVALUATION_HEADER = "全数データセット評価" +DEFAULT_TREND_METADATA_TEXT = """tags: [trend] +pilot_auto_version: "Pilot.Auto v4.3.0 (centerpoint x2/2.3.1)" +data_count: 99,776+ +description: データの追加 +date: 2025.11.7 +""" +_TREND_DATE_PATTERN = re.compile(r"^\d{4}\.\d{1,2}\.\d{1,2}$") +_TREND_DATA_COUNT_PATTERN = re.compile(r"^\d[\d,]*\+?$") +_PILOT_AUTO_PREFIX_PATTERN = re.compile(r"^Pilot\.Auto\s+", re.IGNORECASE) + + +@dataclass +class TrendReleaseGroup: + group_key: str + display_name: str + topic_name: str + group_kind: str + base_dir: Path + jobs: dict[str, dict[str, Any]] + + +def get_specsheet_artifact_paths(run_dir: str | Path) -> dict[str, Path]: + run_path = Path(run_dir) + return { + "run_dir": run_path, + "current_csv": run_path / "current.csv", + "future_csv": run_path / "future.csv", + "current_parquet": run_path / "current.parquet", + "future_parquet": run_path / "future.parquet", + "resource_dir": run_path / "resources", + "trend_metadata": run_path / "resources" / TREND_METADATA_FILENAME, + "trend_summary": run_path / "resources" / TREND_SUMMARY_FILENAME, + "specsheet_dir": run_path / "specsheet", + "specsheet_pdf": run_path / "specsheet" / "specsheet.pdf", + } + + +def _topic_values_from_frame(frame: pd.DataFrame) -> list[str]: + for column in ("topic_name", "topic"): + if column not in frame.columns: + continue + values = [ + str(value).strip() + for value in frame[column].dropna().unique().tolist() + if str(value).strip() + ] + if values: + return sorted(values) + return [] + + +def detect_specsheet_topic_names(run_dir: str | Path, *, csv_sample_rows: int = 50000) -> list[str]: + 
def resolve_specsheet_topic_name(
    run_dir: str | Path,
    requested_topic: str | None,
    *,
    fallback_topic: str = DEFAULT_SPECSHEET_TOPIC,
) -> tuple[str, list[str]]:
    """Pick the topic to use for specsheet generation.

    Preference order: the requested topic if it was detected in the run's
    artifacts, then *fallback_topic* if detected, then the sole detected topic,
    and finally the requested topic (or *fallback_topic* when none was
    requested) even though it was not detected.

    Returns:
        ``(chosen_topic, detected_topics)``.
    """
    wanted = str(requested_topic or "").strip()
    available = detect_specsheet_topic_names(run_dir)

    if wanted and wanted in available:
        chosen = wanted
    elif fallback_topic in available:
        chosen = fallback_topic
    elif len(available) == 1:
        chosen = available[0]
    else:
        chosen = wanted or fallback_topic
    return chosen, available
def resolve_specsheet_generation_run_path(run_dir: str | Path) -> Path:
    """Return the directory whose artifacts form the specsheet PDF body.

    For a release workflow folder this is the ``performance_dir`` reported by
    ``get_release_specsheet_context``; in every other case (no release context,
    or no performance directory) it is *run_dir* itself.
    """
    base_path = Path(run_dir)
    release_context = get_release_specsheet_context(base_path)
    candidate = None if release_context is None else release_context.get("performance_dir")
    return candidate if isinstance(candidate, Path) else base_path
def get_latest_source_mtime(run_dir: str | Path) -> float | None:
    """Return the newest modification time among the run's source parquet files.

    Returns:
        The largest ``st_mtime`` of the files listed by
        ``list_specsheet_source_parquets``, or ``None`` when there are no such
        files (including when every listed file disappeared before it could be
        stat'ed).
    """
    mtimes: list[float] = []
    for path in list_specsheet_source_parquets(run_dir):
        try:
            mtimes.append(path.stat().st_mtime)
        except FileNotFoundError:
            # The file was removed between listing and stat; ignore it rather
            # than racing a separate exists() check.
            continue
    # ``default`` avoids ValueError when nothing could be stat'ed.
    return max(mtimes, default=None)
progress_callback, + f"{self._desc} {self._current_index}/{total}: {text}", + ) + + specsheet_blocks.tqdm = ProgressTqdm + try: + yield + finally: + specsheet_blocks.tqdm = original_tqdm + + +def _copy_parquet_to_csv(parquet_path: Path, csv_path: Path) -> Path: + frame = pd.read_parquet(parquet_path) + frame.to_csv(csv_path, index=False) + return csv_path + + +def _prefer_cjk_font_stack(html_lines: Sequence[str]) -> list[str]: + rendered = list(html_lines) + generic = "font-family: sans-serif;" + preferred = ( + 'font-family: "Noto Sans CJK JP", "Noto Sans JP", ' + '"IPAGothic", "IPA Gothic", sans-serif;' + ) + return [line.replace(generic, preferred) for line in rendered] + + +def parse_trend_metadata_text(text: str) -> dict[str, Any]: + """Parse and validate manual trend metadata YAML input.""" + raw = yaml.safe_load(text or "") + if not isinstance(raw, dict): + raise ValueError("Trend metadata must be a YAML object with key/value pairs.") + + tags = raw.get("tags") + if isinstance(tags, str): + tags = [tags] + if not isinstance(tags, list) or not any(str(tag).strip() == "trend" for tag in tags): + raise ValueError("Trend metadata must include `tags: [trend]`.") + + pilot_auto_version = str(raw.get("pilot_auto_version") or "").strip() + if not pilot_auto_version: + raise ValueError("Trend metadata requires a non-empty `pilot_auto_version`.") + + data_count = str(raw.get("data_count") or "").strip() + if not data_count or not _TREND_DATA_COUNT_PATTERN.match(data_count): + raise ValueError( + "Trend metadata `data_count` must look like `99,776+` or `12345`." 
def _infer_trend_topic(metadata: dict[str, Any], metadata_path: str | Path) -> str:
    """Choose the topic name under which a trend metadata file is reported.

    An explicit ``topic_name`` in *metadata* wins unless it is blank or equals
    ``DEFAULT_SPECSHEET_TOPIC``. Otherwise the path components of
    *metadata_path* are scanned from the innermost outwards for one that starts
    with ``perception.`` and is not ``DEFAULT_SPECSHEET_TOPIC``. If neither
    source yields a topic, ``DEFAULT_TREND_TOPIC`` is returned.
    """
    declared = str(metadata.get("topic_name") or "").strip()
    if declared and declared != DEFAULT_SPECSHEET_TOPIC:
        return declared

    path_topics = (
        segment
        for segment in reversed(Path(metadata_path).parts)
        if segment.startswith("perception.") and segment != DEFAULT_SPECSHEET_TOPIC
    )
    return next(path_topics, DEFAULT_TREND_TOPIC)
def classify_trend_summary(summary: dict[str, Any]) -> str:
    """Classify a trend summary payload by its shape.

    Returns:
        ``"full"`` or ``"usecase"`` when ``summary["blocks"]`` is a list that
        contains a block with the corresponding header, ``"performance_blocks"``
        for any other block list, ``"devops"`` for a non-empty mapping without
        a block list, and ``"unknown"`` otherwise (empty mapping or non-dict).
    """
    if not isinstance(summary, dict):
        return "unknown"

    blocks = summary.get("blocks")
    if isinstance(blocks, list):
        # Non-dict entries carry no header; skip them instead of failing.
        headers = {
            str(block.get("header") or "")
            for block in blocks
            if isinstance(block, dict)
        }
        # Same literal as the module constant FULL_DATASET_EVALUATION_HEADER.
        if "全数データセット評価" in headers:
            return "full"
        if "ユースケース評価" in headers:
            return "usecase"
        return "performance_blocks"
    if summary:
        return "devops"
    return "unknown"
return role + + +def _job_id_from_run_metadata(run_dir: Path, role: str) -> str: + role_key = _release_role_key_for_metadata(role) + candidates = [run_dir] + if run_dir.parent != run_dir: + candidates.append(run_dir.parent) + + for candidate in candidates: + metadata = read_run_metadata(candidate) + release_specsheet = metadata.get("release_specsheet") if isinstance(metadata.get("release_specsheet"), dict) else {} + evaluator_jobs = release_specsheet.get("evaluator_jobs") if isinstance(release_specsheet.get("evaluator_jobs"), dict) else {} + role_meta = evaluator_jobs.get(role_key) if isinstance(evaluator_jobs.get(role_key), dict) else {} + job_id = str(role_meta.get("job_id") or "").strip() + if job_id: + return job_id + + evaluator_meta = metadata.get("evaluator") if isinstance(metadata.get("evaluator"), dict) else {} + job_id = str(evaluator_meta.get("job_id") or "").strip() + if job_id: + return job_id + + request_meta = metadata.get("request") if isinstance(metadata.get("request"), dict) else {} + parameter_meta = request_meta.get("parameters") if isinstance(request_meta.get("parameters"), dict) else {} + for key in (f"{role_key}_job_id", "job_id"): + job_id = str(parameter_meta.get(key) or request_meta.get(key) or "").strip() + if job_id: + return job_id + return "" + + +def _release_metadata_match(candidate: dict[str, Any], target: dict[str, Any]) -> bool: + for key in ("release_group", "pilot_auto_version", "topic_name", "description", "data_count"): + target_value = str(target.get(key) or "").strip() + if target_value and str(candidate.get(key) or "").strip() != target_value: + return False + return True + + +def _job_id_from_matching_release_run_metadata(root_dir: str | Path | None, target_metadata: dict[str, Any], role: str) -> str: + root = Path(root_dir) if root_dir is not None else get_data_root() + if not root.exists() or not root.is_dir(): + return "" + role_key = _release_role_key_for_metadata(role) + candidates = sorted( + [path for path in 
def _trend_date_sort_key(value: Any) -> tuple[int, ...]:
    """Turn a date such as ``2025.11.7`` into a numerically comparable tuple.

    The components are not zero-padded, so comparing the raw strings would put
    ``2025.11.7`` before ``2025.2.1``. Missing values yield ``()``, which sorts
    first.
    """
    return tuple(int(piece) for piece in re.findall(r"\d+", str(value or "")))


def _trend_job_record(
    role: str,
    job_id: str,
    metadata_path: Path,
    summary_path: Path,
    metadata: dict[str, Any],
    summary: dict[str, Any],
) -> dict[str, Any]:
    """Build the per-role job entry stored in ``TrendReleaseGroup.jobs``."""
    return {
        "role": role,
        "job_id": job_id,
        "metadata_path": Path(metadata_path).resolve(),
        "summary_path": Path(summary_path).resolve(),
        "metadata": metadata,
        "summary": summary,
    }


def discover_trend_release_groups(root_dir: str | Path | None = None) -> list[TrendReleaseGroup]:
    """Discover trend metadata under *root_dir* and group it into releases.

    Each metadata file found by ``discover_trend_metadata_files`` is paired
    with the summary file next to it and classified into a role.

    * Metadata stored in a ``resources`` directory belongs to a standalone run.
      Standalone runs that share release group, topic, version, date,
      description and data count are merged into one
      ``standalone_release_group`` when there is more than one of them and no
      role occurs twice; otherwise each run becomes its own ``standalone_run``
      group.
    * Any other metadata file is treated as ``<combined>/<topic>/<job>/`` and
      grouped per combined directory and topic as a ``library_pdf_group``.

    Returns:
        De-duplicated groups ordered by their newest job date (oldest first),
        then by display name.
    """
    grouped: dict[str, TrendReleaseGroup] = {}
    standalone_records: list[dict[str, Any]] = []

    for metadata_path in discover_trend_metadata_files(root_dir):
        summary_path = metadata_path.parent / TREND_SUMMARY_FILENAME
        summary = load_trend_summary_file(summary_path)
        role = classify_trend_summary(summary)
        metadata = load_trend_metadata_file(metadata_path)

        if metadata_path.parent.name == "resources":
            run_dir = metadata_path.parent.parent
            job_id = str(
                metadata.get("job_id")
                or _job_id_from_run_metadata(run_dir, role)
                or _job_id_from_matching_release_run_metadata(root_dir, metadata, role)
                or ""
            )
            standalone_records.append(
                {
                    "group_key": f"run::{run_dir.resolve()}",
                    "display_name": run_dir.name,
                    "topic_name": str(metadata.get("topic_name") or "standalone"),
                    "group_kind": "standalone_run",
                    "base_dir": run_dir,
                    "job": _trend_job_record(
                        role, job_id, metadata_path, summary_path, metadata, summary
                    ),
                }
            )
            continue

        job_dir = metadata_path.parent
        topic_dir = job_dir.parent
        combined_dir = topic_dir.parent
        group_key = f"group::{combined_dir.resolve()}::{topic_dir.name}"
        group = grouped.get(group_key)
        if group is None:
            group = grouped[group_key] = TrendReleaseGroup(
                group_key=group_key,
                display_name=combined_dir.name,
                topic_name=topic_dir.name,
                group_kind="library_pdf_group",
                base_dir=combined_dir,
                jobs={},
            )
        job_id = str(
            metadata.get("job_id")
            or _job_id_from_run_metadata(job_dir, role)
            or _job_id_from_matching_release_run_metadata(root_dir, metadata, role)
            # Last resort: the job directory name itself.
            or job_dir.name
        )
        group.jobs[role] = _trend_job_record(
            role, job_id, metadata_path, summary_path, metadata, summary
        )

    standalone_by_release: dict[tuple[str, str, str, str, str, str], list[dict[str, Any]]] = {}
    for record in standalone_records:
        metadata = record["job"]["metadata"]
        release_key = (
            str(metadata.get("release_group") or ""),
            str(record["topic_name"] or ""),
            str(metadata.get("pilot_auto_version") or ""),
            str(metadata.get("date") or ""),
            str(metadata.get("description") or ""),
            str(metadata.get("data_count") or ""),
        )
        standalone_by_release.setdefault(release_key, []).append(record)

    for release_key, records in standalone_by_release.items():
        roles = [str(record["job"]["role"]) for record in records]
        # Merge only when the runs complement each other: several runs and no
        # role represented twice.
        can_group = len(records) > 1 and len(set(roles)) == len(roles)
        if can_group:
            metadata = records[0]["job"]["metadata"]
            release_label = (
                str(metadata.get("release_group") or "").strip()
                or str(metadata.get("pilot_auto_version") or "").strip()
                or "standalone_release"
            )
            date_label = str(metadata.get("date") or "").strip()
            group_key = "standalone_group::" + "::".join(release_key)
            grouped[group_key] = TrendReleaseGroup(
                group_key=group_key,
                display_name=f"{release_label} | {date_label}" if date_label else release_label,
                topic_name=str(records[0]["topic_name"]),
                group_kind="standalone_release_group",
                base_dir=Path(root_dir) if root_dir is not None else get_data_root(),
                jobs={
                    job_role: record["job"]
                    for job_role, record in zip(roles, records)
                },
            )
            continue

        for job_role, record in zip(roles, records):
            group_key = str(record["group_key"])
            grouped[group_key] = TrendReleaseGroup(
                group_key=group_key,
                display_name=str(record["display_name"]),
                topic_name=str(record["topic_name"]),
                group_kind=str(record["group_kind"]),
                base_dir=record["base_dir"],
                jobs={job_role: record["job"]},
            )

    def _sort_key(group: TrendReleaseGroup) -> tuple[tuple[int, ...], str]:
        dates = [
            _trend_date_sort_key(job["metadata"].get("date"))
            for job in group.jobs.values()
            if isinstance(job.get("metadata"), dict)
        ]
        return (max(dates, default=()), group.display_name)

    return sorted(_deduplicate_trend_release_groups(grouped.values()), key=_sort_key)
str(pilot_auto_version or ""), (999, 999, 999)) + + major = int(match.group(1)) + minor = int(match.group(2)) + patch = int(match.group(3)) + ml_model_type = match.group(4) + ml_model_info = match.group(5) + try: + _, ml_model_version = ml_model_info.split("/") + ml_major, ml_minor, ml_patch = ml_model_version.split(".") + ml_version = (int(ml_major), int(ml_minor), int(ml_patch)) + except ValueError: + ml_version = (999, 999, 999) + return ((major, minor, patch), ml_model_type, ml_version) + + +def _canonical_summary_table_key(table_data: dict[str, Any]) -> str: + return json.dumps(table_data, ensure_ascii=False, sort_keys=True, allow_nan=True) + + +def _deduplicate_summary_tables(data_list: Sequence[dict[str, Any]]) -> list[dict[str, Any]]: + deduplicated: list[dict[str, Any]] = [] + seen: set[str] = set() + for table_data in data_list: + key = _canonical_summary_table_key(table_data) + if key in seen: + continue + seen.add(key) + deduplicated.append(table_data) + return deduplicated + + +def _extract_full_metric_tables(summary: dict[str, Any]) -> list[dict[str, Any]]: + data_list: list[dict[str, Any]] = [] + blocks = summary.get("blocks", []) + if not isinstance(blocks, list): + return data_list + for block in blocks: + if not isinstance(block, dict): + continue + if block.get("header") != FULL_DATASET_EVALUATION_HEADER: + continue + if block.get("mode") not in (None, "metrics"): + continue + if block.get("evaluation_type") not in (None, "full"): + continue + block_tables = block.get("tables", []) + if not isinstance(block_tables, list): + continue + for tables in block_tables: + if not isinstance(tables, dict): + continue + table_data = tables.get("data", {}) + if isinstance(table_data, dict) and table_data: + data_list.append(table_data) + return _deduplicate_summary_tables(data_list) + + +def _load_only_full_summary(summary_path: Path) -> list[dict[str, Any]]: + summary = load_trend_summary_file(summary_path) + return _extract_full_metric_tables(summary) + + 
def extract_devops_case_rows(summary: dict[str, Any]) -> list[dict[str, Any]]:
    """Flatten a nested devops/pass-rate summary into one row per case.

    The summary (after ``_unwrap_devops_summary``) is walked as
    ``major -> mid -> minor``. A minor entry that itself carries ``passed`` or
    ``total`` is one case named after the minor category; otherwise each of its
    dict-valued children is a case. Non-dict nodes at any level are ignored.

    Returns:
        Rows with ``major_category``, ``mid_category``, ``minor_category``,
        ``case_name``, ``passed``, ``total`` and ``pass_rate`` (a percentage,
        or ``None`` when ``total`` is not positive).
    """
    result_keys = {"passed", "total"}
    flattened: list[dict[str, Any]] = []

    for major_name, mid_level in _unwrap_devops_summary(summary).items():
        if not isinstance(mid_level, dict):
            continue
        for mid_name, minor_level in mid_level.items():
            if not isinstance(minor_level, dict):
                continue
            for minor_name, payload in minor_level.items():
                if not isinstance(payload, dict):
                    continue

                if not result_keys.isdisjoint(payload):
                    # The minor entry is itself a result leaf.
                    cases = [(minor_name, payload)]
                else:
                    cases = [
                        (child_name, child)
                        for child_name, child in payload.items()
                        if isinstance(child, dict)
                    ]

                for case_name, outcome in cases:
                    passed_count = int(outcome.get("passed", 0) or 0)
                    total_count = int(outcome.get("total", 0) or 0)
                    rate = (passed_count / total_count * 100.0) if total_count > 0 else None
                    flattened.append(
                        {
                            "major_category": major_name,
                            "mid_category": mid_name,
                            "minor_category": minor_name,
                            "case_name": case_name,
                            "passed": passed_count,
                            "total": total_count,
                            "pass_rate": rate,
                        }
                    )
    return flattened
"passed": int(result.get("passed", 0) or 0), + "total": int(result.get("total", 0) or 0), + } + return normalized + + +def _align_devops_trend_data_structures(trend_data_rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + structure: dict[str, dict[str, set[str]]] = {} + for row in trend_data_rows: + devops_data = _normalize_devops_summary_structure(row.get("devops_data", {})) + row["devops_data"] = devops_data + for major_category, mid_categories in devops_data.items(): + major_structure = structure.setdefault(major_category, {}) + for mid_category, cases in mid_categories.items(): + major_structure.setdefault(mid_category, set()).update(cases.keys()) + + for row in trend_data_rows: + devops_data = row.get("devops_data", {}) + if not isinstance(devops_data, dict): + devops_data = {} + row["devops_data"] = devops_data + for major_category, mid_categories in structure.items(): + row_major = devops_data.setdefault(major_category, {}) + for mid_category, cases in mid_categories.items(): + row_mid = row_major.setdefault(mid_category, {}) + for case_name in cases: + row_mid.setdefault(case_name, {"passed": 0, "total": 0}) + return trend_data_rows + + +def load_performance_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, str | int | float]]: + trend_data_rows: list[dict[str, Any]] = [] + for metadata_path in metadata_list: + metadata = load_trend_metadata_file(metadata_path) + if "trend" not in [str(tag).strip() for tag in metadata.get("tags", [])]: + continue + summary_path = Path(metadata_path).parent / TREND_SUMMARY_FILENAME + if not summary_path.exists(): + continue + summary_list = _load_only_full_summary(summary_path) + if not summary_list: + continue + trend_data_rows.append( + { + "version": metadata.get("pilot_auto_version"), + "version_abbr": _trend_version_abbr(metadata), + "data_count": metadata.get("data_count"), + "description": metadata.get("description"), + "date": metadata.get("date"), + "topic": _infer_trend_topic(metadata, metadata_path), 
+ "summary": summary_list, + } + ) + + trend_data_rows.sort(key=lambda row: _trend_version_sort_key(str(row.get("version") or ""))) + + output: list[dict[str, str | int | float]] = [] + for row in trend_data_rows: + summary = row.get("summary") or [] + if len(summary) != 1: + raise ValueError( + f"Expected exactly one distinct summary block for version {row.get('version')}, " + f"but got {len(summary)}" + ) + metrics = summary[0] + + def _avg(metric_name: str) -> float: + values = metrics.get(metric_name, {}) + if not isinstance(values, dict) or not values: + return float("nan") + numeric = pd.to_numeric(pd.Series(list(values.values())), errors="coerce") + return float(numeric.mean()) + + output.append( + { + "version": row.get("version"), + "version_abbr": row.get("version_abbr"), + "data_count": row.get("data_count"), + "description": row.get("description"), + "date": row.get("date"), + "topic": row.get("topic"), + "mAP": _avg("mAP"), + "precision": _avg("precision"), + "recall": _avg("recall"), + "minADE@1s": _avg("minADE@1s"), + "minFDE@1s": _avg("minFDE@1s"), + "minADE@3s": _avg("minADE@3s"), + "minFDE@3s": _avg("minFDE@3s"), + "minADE@5s": _avg("minADE@5s"), + "minFDE@5s": _avg("minFDE@5s"), + } + ) + return output + + +def load_devops_trend_data(metadata_list: Sequence[Path]) -> list[dict[str, Any]]: + trend_data_rows: list[dict[str, Any]] = [] + for metadata_path in metadata_list: + metadata = load_trend_metadata_file(metadata_path) + if "trend" not in [str(tag).strip() for tag in metadata.get("tags", [])]: + continue + summary_path = Path(metadata_path).parent / TREND_SUMMARY_FILENAME + if not summary_path.exists(): + continue + summary = load_trend_summary_file(summary_path) + if classify_trend_summary(summary) != "devops": + continue + + rows = extract_devops_case_rows(summary) + if not rows: + continue + normalized_summary = _normalize_devops_summary_structure(summary) + overall_passed = sum(int(row["passed"]) for row in rows) + overall_total = 
sum(int(row["total"]) for row in rows) + trend_data_rows.append( + { + "version": metadata.get("pilot_auto_version"), + "version_abbr": _trend_version_abbr(metadata), + "data_count": metadata.get("data_count"), + "description": metadata.get("description"), + "date": metadata.get("date"), + "topic": _infer_trend_topic(metadata, metadata_path), + "overall_pass_rate": (overall_passed / overall_total * 100.0) + if overall_total > 0 + else 0.0, + "scenario_count": overall_total, + "devops_data": normalized_summary, + } + ) + + trend_data_rows.sort(key=lambda row: _trend_version_sort_key(str(row.get("version") or ""))) + return _align_devops_trend_data_structures(trend_data_rows) + + +def _add_devops_detail_trend_rates(devops_trend_data: Sequence[dict[str, Any]]) -> list[str]: + cases: set[str] = set() + for row in devops_trend_data: + devops_data = row.get("devops_data", {}) + if not isinstance(devops_data, dict): + continue + for mid_categories in devops_data.values(): + if not isinstance(mid_categories, dict): + continue + for sub_category, sub_categories in mid_categories.items(): + if not isinstance(sub_categories, dict): + continue + total_passed = sum( + int(result.get("passed", 0) or 0) + for result in sub_categories.values() + if isinstance(result, dict) + ) + total = sum( + int(result.get("total", 0) or 0) + for result in sub_categories.values() + if isinstance(result, dict) + ) + row[sub_category] = total_passed / total * 100.0 if total > 0 else 0.0 + cases.add(str(sub_category)) + return sorted(cases) + + +def _devops_trend_rows_for_template(devops_trend_data: Sequence[dict[str, Any]]) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for row in devops_trend_data: + display_row = dict(row) + version_abbr = str(display_row.get("version_abbr") or "").strip() + if version_abbr: + display_row["version"] = version_abbr + rows.append(display_row) + return rows + + +def _build_trend_context( + metadata_list: Sequence[Path], + output_dir: Path, + 
current_devops_summary_path: Path | None = None, + progress_callback: Callable[[str], None] | None = None, +) -> dict[str, object]: + if not metadata_list: + return { + "performance_trend_data": [], + "map_trend_plot_path": output_dir / "map_trend.png", + "prediction_trend_plot_path": output_dir / "prediction_trend.png", + "devops_data": {}, + "devops_plot_path": None, + "devops_trend_data": [], + "devops_trend_plot_path": output_dir / "devops_trend.png", + "job_ids": [], + } + + try: + from perception_catalog_analyzer.plot.map_trend import generate_map_trend_plot + from perception_catalog_analyzer.plot.prediction_trend import generate_prediction_trend_plot + from perception_catalog_analyzer.plot.devops_trend import ( + generate_devops_trend_detail_plot, + generate_devops_trend_plot, + ) + from perception_catalog_analyzer.plot.devops import generate_devops_plot + except ImportError as exc: + raise RuntimeError( + "perception_catalog_analyzer trend support is unavailable. " + f"Original error: {exc!s}" + ) from exc + + output_dir.mkdir(parents=True, exist_ok=True) + _notify(progress_callback, "Collecting trend history") + performance_trend_data = load_performance_trend_data(list(metadata_list)) + map_trend_plot_path = output_dir / "map_trend.png" + prediction_trend_plot_path = output_dir / "prediction_trend.png" + if performance_trend_data: + _notify(progress_callback, "Rendering trend plots") + generate_map_trend_plot(performance_trend_data, map_trend_plot_path) + generate_prediction_trend_plot(performance_trend_data, prediction_trend_plot_path) + + devops_trend_data = load_devops_trend_data(list(metadata_list)) + devops_trend_plot_path = output_dir / "devops_trend.png" + devops_data = {} + devops_plot_path = None + if current_devops_summary_path is not None and current_devops_summary_path.exists(): + current_devops_summary = load_trend_summary_file(current_devops_summary_path) + if classify_trend_summary(current_devops_summary) == "devops": + devops_data = 
_normalize_devops_summary_structure(current_devops_summary) + if devops_data: + _notify(progress_callback, "Rendering current pass-rate plot") + devops_plot_path = output_dir / "devops.png" + generate_devops_plot(devops_data, devops_plot_path) + if devops_trend_data: + _notify(progress_callback, "Rendering pass-rate trend plots") + generate_devops_trend_plot(devops_trend_data, devops_trend_plot_path) + detail_cases = _add_devops_detail_trend_rates(devops_trend_data) + if detail_cases: + generate_devops_trend_detail_plot( + devops_trend_data, + detail_cases, + devops_trend_plot_path, + ) + + return { + "performance_trend_data": performance_trend_data, + "map_trend_plot_path": map_trend_plot_path, + "prediction_trend_plot_path": prediction_trend_plot_path, + "devops_data": devops_data, + "devops_plot_path": devops_plot_path, + "devops_trend_data": _devops_trend_rows_for_template(devops_trend_data), + "devops_trend_plot_path": devops_trend_plot_path, + "job_ids": [], + } + + +def _update_template_compat( + update_template_func: Callable[..., Sequence[str]], + project_id: str, + version: str, + *, + template_dir: Path, + context_dir: Path, + trend_context: dict[str, object] | None = None, +) -> Sequence[str]: + """Call update_template across analyzer versions with different signatures.""" + try: + parameters = inspect.signature(update_template_func).parameters + except (TypeError, ValueError): + parameters = {} + + trend_context = trend_context or {} + path_manager = SimpleNamespace(specsheet_path=context_dir) + semantic_kwargs = { + "project_id": project_id, + "pilot_auto_version": version, + "version": version, + "devops_data": trend_context.get("devops_data", {}), + "devops_plot_path": trend_context.get("devops_plot_path"), + "performance_trend_data": trend_context.get("performance_trend_data", []), + "map_trend_plot_path": trend_context.get("map_trend_plot_path", context_dir / "map_trend.png"), + "prediction_trend_plot_path": trend_context.get( + 
"prediction_trend_plot_path", context_dir / "prediction_trend.png" + ), + "devops_trend_data": trend_context.get("devops_trend_data", []), + "devops_trend_plot_path": trend_context.get( + "devops_trend_plot_path", context_dir / "devops_trend.png" + ), + "job_ids": trend_context.get("job_ids", []), + "template_name": "static_body.html", + "extensions": ["html"], + "template_dir": str(template_dir), + "path_manager": path_manager, + "show_other_infos": bool(trend_context.get("performance_trend_data")), + } + + accepts_kwargs = any( + param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters.values() + ) + if accepts_kwargs or not parameters: + with _patch_template_dataset_paths(update_template_func, context_dir): + return update_template_func(**semantic_kwargs) + + args: list[object] = [] + kwargs: dict[str, object] = {} + for name, param in parameters.items(): + if name not in semantic_kwargs: + continue + value = semantic_kwargs[name] + if param.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ): + args.append(value) + elif param.kind == inspect.Parameter.KEYWORD_ONLY: + kwargs[name] = value + with _patch_template_dataset_paths(update_template_func, context_dir): + return update_template_func(*args, **kwargs) + + +@contextmanager +def _patch_template_dataset_paths( + update_template_func: Callable[..., Sequence[str]], + context_dir: Path, +): + """Redirect analyzer dataset-summary outputs away from read-only package config.""" + globals_dict = getattr(update_template_func, "__globals__", {}) + patch_keys = ("DATASET_SUMMARY_PATH", "DATASET_TRAIN_PATH", "DATASET_TEST_PATH") + originals = {key: globals_dict.get(key) for key in patch_keys if key in globals_dict} + if not originals: + yield + return + + dataset_dir = context_dir / "dataset_assets" + dataset_dir.mkdir(parents=True, exist_ok=True) + try: + for key, original_path in originals.items(): + if not isinstance(original_path, Path) or not 
original_path.exists(): + continue + target_path = dataset_dir / original_path.name + if not target_path.exists(): + shutil.copy2(original_path, target_path) + globals_dict[key] = target_path + yield + finally: + for key, original_path in originals.items(): + globals_dict[key] = original_path + +def _scene_dataframe_from_dir_compat( + scene_dataframe_cls, + run_path: Path, + *, + topic_name: str, +): + """Call SceneDataFrame.from_dir across analyzer versions with/without topic.""" + from_dir = scene_dataframe_cls.from_dir + try: + parameters = inspect.signature(from_dir).parameters + except (TypeError, ValueError): + parameters = {} + + required_parameters = [ + param + for param in parameters.values() + if param.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + inspect.Parameter.KEYWORD_ONLY, + ) + and param.default is inspect.Parameter.empty + ] + accepts_varargs = any( + param.kind in (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD) + for param in parameters.values() + ) + + if accepts_varargs or len(required_parameters) >= 2: + return from_dir(run_path, topic_name) + return from_dir(run_path) + + +_CURRENT_NUMERIC_COLUMNS = { + "unix_time", + "x", + "y", + "confidence", + "pointcloud_num", + "visibility", + "x_error", + "y_error", + "yaw_error", + "speed_error", + "frame_index", +} +_FUTURE_NUMERIC_COLUMNS = { + "x", + "y", + "tx", + "ty", + "confidence", + "visibility", + "relative_time", + "pair_dt_sec", +} + + +def _coerce_numeric_columns(frame: pd.DataFrame, columns: set[str]) -> pd.DataFrame: + if frame.empty: + return frame + coerced = frame.copy() + for column in sorted(columns.intersection(coerced.columns)): + coerced[column] = pd.to_numeric(coerced[column], errors="coerce") + return coerced + + +def _coerce_specsheet_scene_numeric_columns(df): + """Normalize analyzer-loaded CSV values before NumPy-heavy specsheet metrics.""" + if hasattr(df, "current"): + df.current = 
_coerce_numeric_columns(df.current, _CURRENT_NUMERIC_COLUMNS) + if getattr(df, "future", None) is not None: + df.future = _coerce_numeric_columns(df.future, _FUTURE_NUMERIC_COLUMNS) + return df + if isinstance(df, pd.DataFrame): + return _coerce_numeric_columns( + df, + _CURRENT_NUMERIC_COLUMNS | _FUTURE_NUMERIC_COLUMNS, + ) + return df + + +def _get_blocks_compat( + get_blocks_func: Callable[..., tuple[Sequence[str], Sequence[str]]], + *, + df, + labels: Sequence[str], + metrics: Sequence[str], + topic_name: str, + outdir: Path, + evaluation_type: str, +): + """Call get_blocks across analyzer versions with different keyword support.""" + parquet_compression = "snappy" + try: + from perception_catalog_analyzer.types import ParquetCompression + + parquet_compression = ParquetCompression.SNAPPY + except Exception: + pass + + semantic_kwargs = { + "df": df, + "labels": list(labels), + "metrics": list(metrics), + "resource_path": outdir, + "html_path": outdir.parent if outdir.name == "resources" else outdir, + "parquet_compression": parquet_compression, + "topic_name": topic_name, + "topic": topic_name, + "path": outdir, + "outdir": outdir, + "evaluation_type": evaluation_type, + } + try: + parameters = inspect.signature(get_blocks_func).parameters + except (TypeError, ValueError): + parameters = {} + + accepts_kwargs = any( + param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters.values() + ) + if accepts_kwargs or not parameters: + return get_blocks_func(**semantic_kwargs) + + args: list[object] = [] + kwargs: dict[str, object] = {} + for name, param in parameters.items(): + if name not in semantic_kwargs: + continue + value = semantic_kwargs[name] + if param.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ): + args.append(value) + elif param.kind == inspect.Parameter.KEYWORD_ONLY: + kwargs[name] = value + return get_blocks_func(*args, **kwargs) + + +def _specsheet_compat( + specsheet_func: Callable[..., None], 
+ *, + html: Sequence[str], + abstract_html: Sequence[str], + detailed_html: Sequence[str], + outdir: Path, + report_name: str, +) -> None: + """Call specsheet across analyzer versions with path/outdir differences.""" + path_manager = SimpleNamespace(specsheet_path=outdir) + semantic_kwargs = { + "html": list(html), + "abstract_html": list(abstract_html), + "detailed_html": list(detailed_html), + "path_manager": path_manager, + "path": outdir, + "outdir": outdir, + "report_name": report_name, + } + try: + parameters = inspect.signature(specsheet_func).parameters + except (TypeError, ValueError): + parameters = {} + + accepts_kwargs = any( + param.kind == inspect.Parameter.VAR_KEYWORD for param in parameters.values() + ) + if accepts_kwargs or not parameters: + specsheet_func(**semantic_kwargs) + return + + args: list[object] = [] + kwargs: dict[str, object] = {} + for name, param in parameters.items(): + if name not in semantic_kwargs: + continue + value = semantic_kwargs[name] + if param.kind in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ): + args.append(value) + elif param.kind == inspect.Parameter.KEYWORD_ONLY: + kwargs[name] = value + specsheet_func(*args, **kwargs) + + +def ensure_specsheet_csvs( + run_dir: str | Path, + *, + progress_callback: Callable[[str], None] | None = None, +) -> dict[str, Path | None]: + paths = get_specsheet_artifact_paths(run_dir) + current_csv = paths["current_csv"] + future_csv = paths["future_csv"] + current_parquet = paths["current_parquet"] + future_parquet = paths["future_parquet"] + + if not current_csv.exists(): + if current_parquet.exists(): + _notify(progress_callback, f"Converting {current_parquet.name} -> {current_csv.name}") + _copy_parquet_to_csv(current_parquet, current_csv) + elif list_specsheet_source_parquets(run_dir): + fallback = list_specsheet_source_parquets(run_dir)[0] + _notify(progress_callback, f"Converting {fallback.name} -> {current_csv.name}") + 
_copy_parquet_to_csv(fallback, current_csv) + else: + _notify(progress_callback, "No CSV found. Building CSV from pkl / pkl.z files") + from lib.perception_catalog_io import build_scene_dataframe_from_pkl_dir + + skip_counts: dict[str, int] = {} + + def _on_progress(done: int, total: int) -> None: + _notify(progress_callback, f"Processing pkl files {done}/{total}") + + def _on_skip(path: str | Path, reason: str) -> None: + skip_counts[reason] = skip_counts.get(reason, 0) + 1 + + df = build_scene_dataframe_from_pkl_dir( + run_dir, + on_progress=_on_progress, + on_skip=_on_skip, + ) + if skip_counts: + details = ", ".join( + f"{count} {reason}" for reason, count in sorted(skip_counts.items()) + ) + _notify(progress_callback, f"Skipped pkl files: {details}") + df.to_csv(run_dir) + if not current_csv.exists(): + raise FileNotFoundError(f"Failed to generate {current_csv}") + + if not future_csv.exists() and future_parquet.exists(): + _notify(progress_callback, f"Converting {future_parquet.name} -> {future_csv.name}") + _copy_parquet_to_csv(future_parquet, future_csv) + + return { + "current_csv": current_csv if current_csv.exists() else None, + "future_csv": future_csv if future_csv.exists() else None, + } + + +def generate_specsheet_pdf( + run_dir: str | Path, + *, + project_id: str, + version: str, + labels: Sequence[str], + topic_name: str = DEFAULT_SPECSHEET_TOPIC, + include_trend: bool = False, + trend_metadata: dict[str, Any] | None = None, + force: bool = False, + progress_callback: Callable[[str], None] | None = None, +) -> tuple[Path, bool]: + paths = get_specsheet_artifact_paths(run_dir) + specsheet_dir = paths["specsheet_dir"] + pdf_path = paths["specsheet_pdf"] + + if not force and is_specsheet_pdf_fresh(run_dir): + _notify(progress_callback, "Using existing up-to-date spec-sheet PDF") + return pdf_path, False + + ensure_specsheet_csvs(run_dir, progress_callback=progress_callback) + resolved_topic, detected_topics = resolve_specsheet_topic_name(run_dir, 
topic_name) + if resolved_topic != topic_name: + detected_text = ", ".join(detected_topics) if detected_topics else "none" + _notify( + progress_callback, + f"Using detected topic {resolved_topic} instead of requested topic {topic_name} (detected: {detected_text})", + ) + topic_name = resolved_topic + + try: + from perception_catalog_analyzer.dataframe import SceneDataFrame + from perception_catalog_analyzer.specsheet import get_blocks, specsheet + from perception_catalog_analyzer import template as template_module + from perception_catalog_analyzer.template import update_template + except ImportError as exc: + raise RuntimeError( + "perception_catalog_analyzer spec-sheet generation is unavailable. " + f"Install the dependency first. Original error: {exc!s}" + ) from exc + + run_path = paths["run_dir"] + resource_dir = run_path / "resources" + resource_dir.mkdir(parents=True, exist_ok=True) + specsheet_dir.mkdir(parents=True, exist_ok=True) + block_resource_dir = specsheet_dir / "resources" + block_resource_dir.mkdir(parents=True, exist_ok=True) + trend_asset_dir = specsheet_dir / "trend_assets" + trend_asset_dir.mkdir(parents=True, exist_ok=True) + + _notify(progress_callback, "Loading CSV files") + df = _scene_dataframe_from_dir_compat( + SceneDataFrame, + run_path, + topic_name=topic_name, + ) + df = _coerce_specsheet_scene_numeric_columns(df) + metrics = list(DEFAULT_SPECSHEET_METRICS) + if getattr(df, "future", None) is not None: + metrics.extend(FUTURE_SPECSHEET_METRICS) + + _notify(progress_callback, "Building abstract and detail sections") + with _patch_block_generation_progress(progress_callback): + abstract, detailed = _get_blocks_compat( + get_blocks, + df=df, + labels=list(labels), + metrics=metrics, + topic_name=topic_name, + outdir=block_resource_dir.resolve(), + evaluation_type="full", + ) + + trend_context: dict[str, object] | None = None + if include_trend: + if trend_metadata is None: + raise ValueError("Trend metadata is required when trend mode 
is enabled.") + _notify(progress_callback, "Validating full trend summary") + generated_trend_summary = block_resource_dir / TREND_SUMMARY_FILENAME + trend_summary_path = generated_trend_summary if generated_trend_summary.exists() else paths["trend_summary"] + ensure_full_trend_summary(trend_summary_path) + if generated_trend_summary.exists() and not paths["trend_summary"].exists(): + shutil.copy2(generated_trend_summary, paths["trend_summary"]) + _notify(progress_callback, "Saving trend metadata") + write_trend_metadata(run_path, trend_metadata) + metadata_list = discover_trend_metadata_files() + release_context = get_release_specsheet_context(run_path) + current_devops_summary_path = None + if release_context is not None: + roles = release_context.get("roles", {}) + if isinstance(roles, dict): + devops_info = roles.get("devops", {}) + if isinstance(devops_info, dict): + summary_path = devops_info.get("summary") + if isinstance(summary_path, Path): + current_devops_summary_path = summary_path + trend_context = _build_trend_context( + metadata_list, + trend_asset_dir, + current_devops_summary_path=current_devops_summary_path, + progress_callback=progress_callback, + ) + + _notify(progress_callback, "Rendering PDF") + for stale_output in (specsheet_dir / "specsheet.html", pdf_path): + if stale_output.exists() and not os.access(stale_output, os.W_OK): + stale_output.unlink() + template_dir = Path(template_module.__file__).resolve().parent.parent / "template" + html = _prefer_cjk_font_stack( + _update_template_compat( + update_template, + project_id, + version, + template_dir=template_dir, + context_dir=specsheet_dir, + trend_context=trend_context, + ) + ) + _specsheet_compat( + specsheet, + html=html, + abstract_html=abstract, + detailed_html=detailed, + outdir=specsheet_dir, + report_name="specsheet", + ) + if not pdf_path.exists(): + raise FileNotFoundError(f"Spec-sheet PDF was not created: {pdf_path}") + _notify(progress_callback, "Spec-sheet PDF is ready") + 
return pdf_path, True + + +def collect_candidate_specsheet_labels( + run_dir: str | Path, + *, + preferred: Iterable[str] | None = None, +) -> list[str]: + preferred_labels = [str(v) for v in (preferred or []) if str(v).strip()] + if preferred_labels: + return sorted(dict.fromkeys(preferred_labels)) + + paths = get_specsheet_artifact_paths(run_dir) + for source in ( + paths["current_csv"], + paths["current_parquet"], + ): + if not source.exists(): + continue + try: + if source.suffix == ".csv": + frame = pd.read_csv(source) + else: + frame = pd.read_parquet(source, columns=["label"]) + if "label" not in frame.columns: + continue + labels = [str(v) for v in frame["label"].dropna().unique() if str(v).strip()] + if labels: + return sorted(labels) + except Exception: + continue + return [] + + +_PROGRESS_FRACTION_PATTERN = re.compile(r"(?P<done>\d+)\s*/\s*(?P<total>\d+)") + + +def progress_fraction_from_message(message: str) -> float | None: + match = _PROGRESS_FRACTION_PATTERN.search(message or "") + if not match: + return None + done = int(match.group("done")) + total = int(match.group("total")) + if total <= 0: + return None + return max(0.0, min(1.0, done / total)) diff --git a/evaluation_dashboard_app/lib/summary_compare.py b/evaluation_dashboard_app/lib/summary_compare.py index bb7272c..e152409 100644 --- a/evaluation_dashboard_app/lib/summary_compare.py +++ b/evaluation_dashboard_app/lib/summary_compare.py @@ -21,3 +21,56 @@ def build_summary_delta(df_a: pd.DataFrame, df_b: pd.DataFrame) -> pd.DataFrame: result[f"{m}_B"] = df_b.loc[common_idx, m] result[f"{m}_delta"] = df_b.loc[common_idx, m] - df_a.loc[common_idx, m] return result.reset_index() + + +def summary_delta_overlap_stats(df_a: pd.DataFrame, df_b: pd.DataFrame) -> dict: + """Describe index overlap used by :func:`build_summary_delta` (same join-key rules).""" + if df_a is None or df_b is None: + return {"valid": False, "error": "Summary dataframe missing.", "key_cols": []} + if "id" not in df_a.columns or "id" not
in df_b.columns: + return { + "valid": False, + "error": "Summary must include an `id` column for delta alignment.", + "key_cols": ["id"], + } + if "perception_label" in df_a.columns and "perception_label" in df_b.columns: + key_cols = ["id", "perception_label"] + else: + key_cols = ["id"] + for c in key_cols: + if c not in df_a.columns or c not in df_b.columns: + return { + "valid": False, + "error": f"Join needs column `{c}` in both summaries; one run is missing it.", + "key_cols": key_cols, + } + + idx_a = df_a.set_index(key_cols).index + idx_b = df_b.set_index(key_cols).index + common = idx_a.intersection(idx_b) + only_a = idx_a.difference(idx_b) + only_b = idx_b.difference(idx_a) + + def _sample(idx_diff: pd.Index, k: int = 5) -> list[str]: + if len(idx_diff) == 0: + return [] + out: list[str] = [] + for x in list(idx_diff)[:k]: + if isinstance(x, tuple): + out.append(", ".join(str(p) for p in x)) + else: + out.append(str(x)) + return out + + return { + "valid": True, + "key_cols": key_cols, + "n_rows_baseline": int(len(df_a)), + "n_rows_candidate": int(len(df_b)), + "n_matched_keys": int(len(common)), + "n_only_baseline": int(len(only_a)), + "n_only_candidate": int(len(only_b)), + "sample_only_baseline": _sample(only_a), + "sample_only_candidate": _sample(only_b), + "matched_empty": len(common) == 0, + } diff --git a/evaluation_dashboard_app/lib/t4_dataset_embed.py b/evaluation_dashboard_app/lib/t4_dataset_embed.py new file mode 100644 index 0000000..c49d3bf --- /dev/null +++ b/evaluation_dashboard_app/lib/t4_dataset_embed.py @@ -0,0 +1,104 @@ +"""Build embeddable T4 dataset metadata: JSON records, query strings, and ``POST /render`` bodies. + +Use with :mod:`lib.t4_visualizer_client` when wiring eval parquet rows or dashboards to ``t4-server``. 
+""" + +from __future__ import annotations + +import json +from typing import Any, List, Mapping, Optional, Sequence +from urllib.parse import quote + +from lib.t4_visualizer_client import ( + RenderRequest, + TargetObjectIn, + render_request_to_json_body, + target_object_from_gt_row, +) + + +def t4_dataset_context( + t4dataset_id: str, + scenario_name: str, + *, + frame_index: Optional[int] = None, + data_dir: Optional[str] = None, + sample_token: Optional[str] = None, +) -> dict[str, Any]: + """Structured record for logging, sidecar JSON, or UI state.""" + out: dict[str, Any] = { + "t4dataset_id": t4dataset_id, + "scenario_name": scenario_name, + } + if frame_index is not None: + out["frame_index"] = int(frame_index) + if data_dir: + out["data_dir"] = data_dir + if sample_token: + out["sample_token"] = sample_token + return out + + +def t4_share_query_params( + t4dataset_id: str, + scenario_name: str, + frame_index: int = 0, +) -> str: + """Query string without leading ``?`` (for bookmarks or deep links).""" + return ( + f"t4dataset_id={quote(str(t4dataset_id), safe='')}" + f"&scenario_name={quote(str(scenario_name), safe='')}" + f"&frame_index={int(frame_index)}" + ) + + +def t4_share_query_params_from_post_render_json(body: Mapping[str, Any]) -> str: + """Query string (no ``?``) with a single ``render_json`` param: same object as curl ``-d`` / ``post_render_json``.""" + compact = json.dumps(dict(body), separators=(",", ":"), ensure_ascii=False) + return f"render_json={quote(compact, safe='')}" + + +def target_objects_from_rows(rows: Sequence[Mapping[str, Any]]) -> List[dict[str, Any]]: + """Map each row to a ``target_objects`` dict (see :func:`target_object_from_gt_row`).""" + return [target_object_from_gt_row(r) for r in rows] + + +def build_render_request_embed( + t4dataset_id: str, + scenario_name: str, + frame_index: int, + *, + target_rows: Optional[Sequence[Mapping[str, Any]]] = None, + target_objects: Optional[Sequence[TargetObjectIn]] = None, + 
# Optional per-box columns copied into the viewer payload as floats when present.
_OPTIONAL_NUMERIC_FIELDS = (
    "vx",
    "vy",
    "confidence",
    "pointcloud_num",
    "x_error",
    "y_error",
    "z_error",
    "yaw_error",
    "vx_error",
    "vy_error",
    "speed_error",
    "center_distance",
    "plane_distance",
    "pair_dt_sec",
    "dx_min",
    "dy_min",
    "unix_time",
    "frame_index",
)

# Optional per-box columns copied into the viewer payload as strings when present.
_OPTIONAL_TEXT_FIELDS = (
    "frame_id",
    "shape_type",
    "visibility",
    "pair_uuid",
    "topic_name",
    "t4dataset_id",
    "suite_name",
    "t4dataset_name",
    "scenario_name",
    "run",
    "source",
)

_VEHICLE_LABELS = {"car", "truck", "bus", "trailer"}
_LEGACY_EXTERNAL_BBOX_YAW_OFFSET = math.pi / 2


def _is_missing(value: object) -> bool:
    """Return True when *value* is a missing-data marker.

    Covers ``None``, float NaN, and any object that compares unequal to
    itself. Objects whose self-comparison has no usable truth value (the
    comparison result raises ``TypeError`` in ``bool()``, as ``pandas.NA``
    does) are also treated as missing instead of crashing the caller.
    """
    if value is None:
        return True
    if isinstance(value, float):
        return math.isnan(value)
    try:
        # NaN-like sentinels are the only values that differ from themselves.
        return bool(value != value)
    except TypeError:
        # Ambiguous truth value of the comparison result: an NA-style marker.
        return True
def _single_frame_layer_dict(df_frame: "pd.DataFrame") -> dict:
    """Per-frame gt / pred / matched_pairs (no ``type`` field); used by single- and all-frame payloads.

    Rows are split on the ``source`` column (``"GT"`` vs ``"EST"``). A GT box
    and a pred box are paired when both have ``status == "TP"`` and share the
    same match key (``pair_uuid``, falling back to ``uuid``); only the first
    box per key on each side is used.

    Returns:
        ``{"gt": [...], "pred": [...], "matched_pairs": [...]}``; all three
        lists are empty when the frame is ``None``, empty, or has no
        ``source`` column.
    """
    if df_frame is None or df_frame.empty or "source" not in df_frame.columns:
        return {"gt": [], "pred": [], "matched_pairs": []}

    def _number(row: "pd.Series", key: str, default: float = 0.0) -> float:
        # Missing (None/NaN) and falsy values both fall back to the default so
        # NaN never reaches the payload through a core numeric field.
        value = row.get(key, default)
        if _is_missing(value) or not value:
            return float(default)
        return float(value)

    def _text(row: "pd.Series", key: str) -> str:
        # A NaN cell must become "" rather than the literal string "nan",
        # otherwise NaN uuids would act as a shared match key.
        value = row.get(key, "")
        if _is_missing(value) or not value:
            return ""
        return str(value)

    def _row_to_box(row: "pd.Series") -> dict:
        box = {
            "x": _number(row, "x"),
            "y": _number(row, "y"),
            "z": _number(row, "z"),
            "width": _number(row, "width"),
            "length": _number(row, "length"),
            "height": _number(row, "height", 1.5),
            "yaw": _number(row, "yaw"),
            "label": _text(row, "label"),
            "uuid": _text(row, "uuid"),
            "status": _text(row, "status"),
        }
        for field in _OPTIONAL_NUMERIC_FIELDS:
            if field in row.index:
                value = row.get(field)
                if not _is_missing(value):
                    box[field] = float(value)
        for field in _OPTIONAL_TEXT_FIELDS:
            if field in row.index:
                value = row.get(field)
                if not _is_missing(value):
                    box[field] = str(value)
        return box

    def _first_tp_index(boxes: list) -> dict:
        # Map each match key to the index of the first TP box carrying it.
        index: dict = {}
        for i, box in enumerate(boxes):
            match_key = str(box.get("pair_uuid") or box.get("uuid") or "")
            if box["status"] == "TP" and match_key:
                index.setdefault(match_key, i)
        return index

    source = df_frame["source"]
    gt_boxes = [_row_to_box(r) for _, r in df_frame[source == "GT"].iterrows()]
    pred_boxes = [_row_to_box(r) for _, r in df_frame[source == "EST"].iterrows()]

    gt_tp_idx = _first_tp_index(gt_boxes)
    pred_tp_idx = _first_tp_index(pred_boxes)
    matched_pairs = [
        {"gt_idx": int(gi), "pred_idx": int(pred_tp_idx[match_key]), "pair_uuid": match_key}
        for match_key, gi in gt_tp_idx.items()
        if match_key in pred_tp_idx
    ]

    return {
        "gt": gt_boxes,
        "pred": pred_boxes,
        "matched_pairs": matched_pairs,
    }
return {"type": "bbox_layers_by_frame", "frames": frames} + + +def render_t4_three_js_embed(viewer_three_url: str, layer_payload: dict, height: int = 700) -> None: + """Iframe to T4 three viewer + postMessage with bbox layer payload (GT, pred, matched pairs).""" + _payload_json = json.dumps(layer_payload, ensure_ascii=True) + _payload_b64 = _payload_json.encode("utf-8").hex() + _iframe_src = html.escape(viewer_three_url, quote=True) + components.html( + ( + f'' + "" + ), + height=height + 24, + scrolling=True, + ) diff --git a/evaluation_dashboard_app/lib/t4_visualizer_client.py b/evaluation_dashboard_app/lib/t4_visualizer_client.py new file mode 100644 index 0000000..85a96e7 --- /dev/null +++ b/evaluation_dashboard_app/lib/t4_visualizer_client.py @@ -0,0 +1,320 @@ +"""HTTP client for the T4 Visualizer FastAPI server (render_frame over HTTP). + +Default base URL: ``T4_VISUALIZER_BASE_URL`` environment variable, or ``http://127.0.0.1:8000``. + +Does not import t4_devkit or t4_visualizer; only uses ``requests`` against the server's +``GET /health``, ``GET /server/structure.json``, ``GET /datasets``, ``GET /datasets/{id}/availability``, +``GET /datasets/{id}/scenarios``, and ``POST /render`` endpoints. 
# Fallback server address used when the environment variable below is unset.
# Loopback, as documented in this module's docstring; a site-specific host
# belongs in T4_VISUALIZER_BASE_URL, not in the library default.
DEFAULT_BASE_URL = "http://127.0.0.1:8000"
# Name of the environment variable that overrides DEFAULT_BASE_URL.
ENV_BASE_URL = "T4_VISUALIZER_BASE_URL"
def render_response_json_for_debug(
    data: Mapping[str, Any], *, max_b64_preview: int = 120
) -> dict[str, Any]:
    """Return a shallow copy of a ``POST /render`` response that is safe to show in a UI.

    Each dict entry of ``images`` whose ``png_base64`` string is longer than
    *max_b64_preview* is replaced by a copy holding the first
    *max_b64_preview* characters plus an ellipsis, and gains a
    ``png_base64_len`` key with the original length. Non-dict entries and
    short payloads pass through unchanged; the input mapping is not modified.
    """

    def _preview(entry: Any) -> Any:
        if not isinstance(entry, dict):
            return entry
        copy = dict(entry)
        payload = copy.get("png_base64")
        if isinstance(payload, str) and len(payload) > max_b64_preview:
            copy["png_base64"] = f"{payload[:max_b64_preview]}…"
            copy["png_base64_len"] = len(payload)
        return copy

    result: dict[str, Any] = dict(data)
    images = result.get("images")
    if isinstance(images, list):
        result["images"] = [_preview(entry) for entry in images]
    return result
def health(self) -> dict:
    """GET /health — status, ``service``, ``version``, ``data_dir_exists``, structure paths (newer servers).

    Returns:
        The decoded JSON body of the response.

    Raises:
        T4VisualizerError: on a non-OK HTTP status (via ``_raise_for_status``)
            or when the body is not valid JSON.
    """
    # No stdout echo of the raw body: callers receive the parsed payload, and
    # failures already carry the response text inside T4VisualizerError.
    resp = self._session.get(self._url("/health"), timeout=self.timeout)
    self._raise_for_status(resp)
    try:
        return resp.json()
    except ValueError as exc:
        raise T4VisualizerError("Invalid JSON from /health") from exc
cache/runtime meta.""" + to = min(30.0, float(self.timeout)) + resp = self._session.get(self._url("/server/structure.json"), timeout=to) + self._raise_for_status(resp) + try: + return resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /server/structure.json") from exc + + def list_datasets(self) -> dict: + """GET /datasets — returns at least ``data_dir`` and ``datasets``.""" + resp = self._session.get(self._url("/datasets"), timeout=self.timeout) + self._raise_for_status(resp) + try: + return resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /datasets") from exc + + def list_dataset_scenarios( + self, t4dataset_id: str, version: Optional[str] = None + ) -> dict: + """GET /datasets/{t4dataset_id}/scenarios — scene names and ``nbr_samples`` (frame counts). + + Response keys typically include ``t4dataset_id``, ``scenarios`` (list of dicts with + ``name``, ``token``, ``description``, ``nbr_samples``), and optional ``version``. + """ + from urllib.parse import quote + + tid = quote(str(t4dataset_id), safe="") + params = {"version": version} if version is not None else None + resp = self._session.get( + self._url(f"/datasets/{tid}/scenarios"), + params=params, + timeout=self.timeout, + ) + self._raise_for_status(resp) + try: + return resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /datasets/.../scenarios") from exc + + def dataset_availability(self, t4dataset_id: str) -> dict: + """GET /datasets/{t4dataset_id}/availability — whether the dataset is on disk for this server. + + Typical JSON: ``t4dataset_id``, ``available`` (bool), ``dataset_path`` (str or null). 
+ """ + from urllib.parse import quote + + tid = quote(str(t4dataset_id), safe="") + resp = self._session.get( + self._url(f"/datasets/{tid}/availability"), + timeout=self.timeout, + ) + self._raise_for_status(resp) + try: + return resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /datasets/.../availability") from exc + + def render(self, payload: RenderRequest) -> RenderResult: + """POST /render with a :class:`RenderRequest`.""" + body = render_request_to_json_body(payload) + resp = self._session.post( + self._url("/render"), + json=body, + timeout=self.timeout, + ) + self._raise_for_status(resp) + try: + data = resp.json() + except ValueError as exc: + raise T4VisualizerError("Invalid JSON from /render") from exc + + try: + images_raw = data["images"] + imgs = [ + ImageResult(label=str(x["label"]), png_base64=str(x["png_base64"])) + for x in images_raw + ] + + def _opt_float(key: str) -> Optional[float]: + v = data.get(key) + if v is None: + return None + return float(v) + + return RenderResult( + sample_token=str(data["sample_token"]), + timestamp_us=int(data["timestamp_us"]), + images=imgs, + raw_json=dict(data), + elapsed_ms=_opt_float("elapsed_ms"), + tier4_load_ms=_opt_float("tier4_load_ms"), + render_ms=_opt_float("render_ms"), + ) + except (KeyError, TypeError, ValueError) as exc: + raise T4VisualizerError(f"Unexpected /render response shape: {data!r}") from exc diff --git a/evaluation_dashboard_app/lib/tlr_eval_analyzer.py b/evaluation_dashboard_app/lib/tlr_eval_analyzer.py index 437e8df..ac88053 100644 --- a/evaluation_dashboard_app/lib/tlr_eval_analyzer.py +++ b/evaluation_dashboard_app/lib/tlr_eval_analyzer.py @@ -14,6 +14,11 @@ import pandas as pd import numpy as np +try: + import yaml +except ImportError: # pragma: no cover - optional dependency fallback + yaml = None + def _obj_to_dict(obj: Any) -> Any: """Recursively convert an object to dict/list primitives for TLR frame structure.""" @@ -34,6 +39,8 @@ class 
TLREvaluationAnalyzer: def __init__(self, result_directory: str): self.result_directory = result_directory self.scenario_results: Dict[str, List[Dict]] = {} + self.scenario_paths: Dict[str, Path] = {} + self.scenario_metadata: Dict[str, Dict[str, str]] = {} self.criteria_data: Dict[str, Dict] = {} self.cached_vehicle_statuses: Dict[str, List[Dict]] = {} self.cached_traffic_light_data: Dict[str, List[Dict]] = {} @@ -58,18 +65,21 @@ def load_all_results_from_pkl(self) -> None: frames = self._load_pkl_scenario(child) if frames: self.scenario_results[child.name] = frames + self.scenario_paths[child.name] = child # Also support flat layout: root contains *.pkl.z (e.g. archive) — each file = one scenario if not self.scenario_results: for pkl_path in root.glob("*.pkl.z"): frames = self._load_single_pkl_file(pkl_path) if frames: self.scenario_results[pkl_path.stem] = frames + self.scenario_paths[pkl_path.stem] = pkl_path.parent for pkl_path in root.glob("*.pkl"): if pkl_path.name == "scene_result.pkl": continue # already handled as child/scene_result.pkl frames = self._load_single_pkl_file(pkl_path) if frames: self.scenario_results[pkl_path.stem] = frames + self.scenario_paths[pkl_path.stem] = pkl_path.parent def _load_pkl_scenario(self, scenario_path: Path) -> List[Dict]: """Load one scenario dir: scene_result.pkl or first .pkl.z in that dir.""" @@ -160,6 +170,7 @@ def load_all_results_from_json(self) -> None: if result_file.exists(): # Flat: direct child has result.json self.scenario_results[child.name] = self._load_result_jsonl(os.fspath(result_file)) + self.scenario_paths[child.name] = child else: # Suite: child is a suite folder; look for testcase subdirs with result.json for testcase_dir in child.iterdir(): @@ -169,6 +180,7 @@ def load_all_results_from_json(self) -> None: if tc_result.exists(): scenario_key = f"{child.name}/{testcase_dir.name}" self.scenario_results[scenario_key] = self._load_result_jsonl(os.fspath(tc_result)) + self.scenario_paths[scenario_key] = 
def _get_scenario_metadata(self, scenario_name: str) -> Dict[str, str]:
    """Load lightweight scenario metadata such as t4dataset_id from scenario.yaml.

    The result is cached per scenario in ``self.scenario_metadata``.
    ``scenario.yaml`` is looked up in the scenario's directory from
    ``self.scenario_paths``; if that directory name ends in ``_`` plus eight
    hex digits, the sibling directory without the suffix is tried as well.
    ``t4dataset_id`` is the comma-joined list of keys found in the mapping
    entries of ``Evaluation.Datasets``.

    This is best effort: a missing or unreadable file, a missing PyYAML, or a
    document with an unexpected shape all yield ``{"t4dataset_id": ""}``.
    """
    if scenario_name in self.scenario_metadata:
        return self.scenario_metadata[scenario_name]

    metadata: Dict[str, str] = {"t4dataset_id": ""}
    scenario_path = self.scenario_paths.get(scenario_name)
    scenario_yaml_path = None
    if scenario_path:
        direct_yaml_path = scenario_path / "scenario.yaml"
        if direct_yaml_path.is_file():
            scenario_yaml_path = direct_yaml_path
        elif re.fullmatch(r".*_[0-9a-fA-F]{8}", scenario_path.name):
            # Some TLR layouts store result.json in a suffixed sibling dir
            # (for example ``ScenarioName_4038db04``) while ``scenario.yaml``
            # lives in the unsuffixed directory next to it.
            unsuffixed_name = scenario_path.name.rsplit("_", 1)[0]
            sibling_yaml_path = scenario_path.parent / unsuffixed_name / "scenario.yaml"
            if sibling_yaml_path.is_file():
                scenario_yaml_path = sibling_yaml_path

    if scenario_yaml_path is not None and yaml is not None:
        try:
            with open(scenario_yaml_path, "r", encoding="utf-8") as f:
                scenario_doc = yaml.safe_load(f)
        except (OSError, UnicodeDecodeError, yaml.YAMLError):
            scenario_doc = None
        # A valid YAML file may still hold a list or scalar; only walk mappings
        # so a malformed scenario.yaml cannot abort the whole DataFrame build.
        evaluation = scenario_doc.get("Evaluation") if isinstance(scenario_doc, dict) else None
        datasets = evaluation.get("Datasets") if isinstance(evaluation, dict) else None
        if isinstance(datasets, list):
            dataset_ids = [str(key) for item in datasets if isinstance(item, dict) for key in item]
            metadata["t4dataset_id"] = ", ".join(dataset_ids)

    self.scenario_metadata[scenario_name] = metadata
    return metadata
results in self.scenario_results.items(): if not results: continue + scenario_metadata = self._get_scenario_metadata(scenario_name) vehicle_statuses = self.cached_vehicle_statuses.get(scenario_name) traffic_light_data = self.cached_traffic_light_data.get(scenario_name) if not vehicle_statuses or not traffic_light_data: @@ -678,6 +729,7 @@ def get_vehicle_status_details_df(self) -> pd.DataFrame | None: for i, (frame_status_info, tlr_info) in enumerate(zip(vehicle_statuses, traffic_light_data)): all_status_data.append({ "scenario": scenario_name, + "t4dataset_id": scenario_metadata.get("t4dataset_id", ""), "frame_index": i, "frame_name": tlr_info.get("frame", ""), "status": frame_status_info["status"], diff --git a/evaluation_dashboard_app/lib/ui/__init__.py b/evaluation_dashboard_app/lib/ui/__init__.py index 6bae170..762606b 100644 --- a/evaluation_dashboard_app/lib/ui/__init__.py +++ b/evaluation_dashboard_app/lib/ui/__init__.py @@ -35,6 +35,8 @@ ) from lib.ui.styles_download import inject_download_page_styles from lib.ui.styles_global import inject_app_page_styles +from lib.ui.task_history import get_task_list_current_user, render_task_detail_content, render_task_list +from lib.ui.task_result_summary import render_summary_table, render_task_result_summary __all__ = [ "ImpressiveProgressHUD", @@ -62,6 +64,11 @@ "render_job_json_summary_panel", "render_recent_scenario_downloads_intro", "render_scenario_download_summary_panel", + "get_task_list_current_user", + "render_summary_table", + "render_task_detail_content", + "render_task_list", + "render_task_result_summary", "render_kpi_card", "section_header_html", ] diff --git a/evaluation_dashboard_app/lib/ui/detection_stats.py b/evaluation_dashboard_app/lib/ui/detection_stats.py index 780d245..b4d0f1d 100644 --- a/evaluation_dashboard_app/lib/ui/detection_stats.py +++ b/evaluation_dashboard_app/lib/ui/detection_stats.py @@ -2,7 +2,6 @@ from __future__ import annotations -import html from contextlib import 
contextmanager import streamlit as st @@ -272,25 +271,15 @@ def section_header_html(title: str, caption: str = "") -> str: return f'
{title}
' -def ds_spot_loading_markup(label: str) -> str: - """Compact inline HTML: shows where the app is busy (Streamlit runs top-to-bottom, so this “moves” down the page).""" - safe = html.escape(label) - return f"""
- - Working here - {safe} - -
""" +def ds_spot_loading_markup(_label: str) -> str: + """Spot loader HTML disabled (was: “Working here” + label); returns empty string.""" + return "" @contextmanager -def ds_spot_loading(label: str): - slot = st.empty() - slot.markdown(ds_spot_loading_markup(label), unsafe_allow_html=True) - try: - yield - finally: - slot.empty() +def ds_spot_loading(_label: str): + """Spot loader context manager disabled (no-op); kept for call-site compatibility.""" + yield def detection_stats_page_loading_banner_markup() -> str: """Top-of-page banner while queries and charts stream in.""" diff --git a/evaluation_dashboard_app/lib/ui/download_ui.py b/evaluation_dashboard_app/lib/ui/download_ui.py index 100d89f..28c6c23 100644 --- a/evaluation_dashboard_app/lib/ui/download_ui.py +++ b/evaluation_dashboard_app/lib/ui/download_ui.py @@ -127,15 +127,15 @@ def render_download_hero(*, queue_enabled: bool) -> None: ) -def render_download_task_section_header(*, since_days: int = 7, max_rows: int = 200) -> None: +def render_download_task_section_header(*, since_days: Optional[int] = 7, max_rows: int = 200) -> None: """Lightweight title for the worker task list (no extra card chrome — task rows are the cards).""" - days = int(since_days) cap = int(max_rows) st.subheader("Recent tasks") - st.caption( - f"Queued/running jobs below; completed or failed in **Task history**. " - f"Last **{days}** days, up to **{cap}** rows." - ) + if since_days is None: + window = "All time" + else: + window = f"Last **{int(since_days)}** days" + st.caption(f"Queued/running jobs below; completed or failed in **Task history**. 
{window}, up to **{cap}** rows.") def _coerce_progress_fraction(progress_pct: Optional[Any]) -> Optional[float]: diff --git a/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py new file mode 100644 index 0000000..933f040 --- /dev/null +++ b/evaluation_dashboard_app/lib/ui/recent_evaluator_jobs.py @@ -0,0 +1,2087 @@ +"""Shared Recent Evaluator Jobs UI.""" + +from __future__ import annotations + +import html +import os +import urllib.parse +from concurrent.futures import ThreadPoolExecutor, as_completed +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional + +import pandas as pd +import requests +import streamlit as st + +from lib import evaluator_api +from lib.path_utils import resolve_under_data_root, to_data_relative + +_JST = timezone(timedelta(hours=9)) +_CONFIG_GETTER: Callable[[str, Any], Any] = lambda key, default=None: default +_CONFIG_SETTER: Callable[[str, Any], None] = lambda key, value: None +_ENQUEUE_TASK: Callable[[str, Dict[str, Any]], Optional[str]] = lambda task_type, params: None +CATALOG_IO_AVAILABLE = False +ENVIRONMENT = "default" +_DEFAULT_EVAL_WORKERS = 4 + + +def _default_eval_workers() -> int: + try: + workers = int(os.environ.get("EVAL_WORKERS_DEFAULT", _DEFAULT_EVAL_WORKERS)) + except (TypeError, ValueError): + workers = _DEFAULT_EVAL_WORKERS + return max(1, min(workers, 16)) + + +def configure_recent_evaluator_jobs_ui(*, get_config_value: Callable[[str, Any], Any], set_config_value: Callable[[str, Any], None], enqueue_task: Callable[[str, Dict[str, Any]], Optional[str]], catalog_io_available: bool, environment: str = "default") -> None: + global _CONFIG_GETTER, _CONFIG_SETTER, _ENQUEUE_TASK, CATALOG_IO_AVAILABLE, ENVIRONMENT + _CONFIG_GETTER = get_config_value + _CONFIG_SETTER = set_config_value + _ENQUEUE_TASK = enqueue_task + CATALOG_IO_AVAILABLE = bool(catalog_io_available) + ENVIRONMENT = 
def _friendly_request_error_message(exc: Exception) -> str:
    """Translate a request failure into a short, user-facing explanation.

    The lower-cased exception text is scanned for known substrings. Rules are
    checked in order and the first match wins; anything unrecognised gets a
    generic "check the network" message.
    """
    haystack = str(exc or "").strip().lower()
    rules = (
        (
            ("temporary failure in name resolution", "failed to resolve", "name resolution"),
            "Could not load evaluator jobs because the network appears to be unavailable.",
        ),
        (
            ("auth.web.auto", "/token"),
            "Could not load evaluator jobs because the sign-in service is currently unavailable.",
        ),
        (
            ("connection refused", "max retries exceeded", "newconnectionerror"),
            "Could not connect to the evaluator service right now. Please try again in a moment.",
        ),
        (
            ("timed out", "timeout"),
            "Loading evaluator jobs took too long. Please try again.",
        ),
    )
    for needles, message in rules:
        if any(needle in haystack for needle in needles):
            return message
    return "Could not load evaluator jobs right now. Please check the network connection and try again."
def _load_catalog_presets() -> List[Dict[str, Any]]:
    """Load catalog presets from the app-level catalogs.json file if available.

    Candidate files are tried in order: ``catalogs.json`` two directories
    above this module's directory, the path in the ``CATALOGS_PATH``
    environment variable (only when it is set and non-empty), and
    ``catalogs.json`` in the current working directory. The first candidate
    that is a readable JSON file holding a list (either at top level or under
    a ``"catalogs"`` key) wins.

    Returns:
        One dict per usable entry: the original item plus a stripped
        ``display_name`` taken from ``display_name``, ``name`` or
        ``catalog_id``. Entries that are not dicts or have no usable name are
        dropped. Returns ``[]`` when no candidate can be loaded.
    """
    import json  # local import: this module does not import json at file level

    module_parents = Path(__file__).resolve().parents
    # Two levels above this module's directory; falls back to the filesystem
    # root if the module is installed unusually shallow.
    app_root = module_parents[2] if len(module_parents) > 2 else module_parents[-1]

    candidates: List[Path] = [app_root / "catalogs.json"]
    env_path = os.environ.get("CATALOGS_PATH", "")
    if env_path:
        # Path("") would silently mean the current directory, so skip it.
        candidates.append(Path(env_path))
    candidates.append(Path.cwd() / "catalogs.json")

    for path in candidates:
        try:
            if not path.is_file():
                continue
            with path.open("r", encoding="utf-8") as handle:
                data = json.load(handle)
        except (OSError, ValueError):
            # Unreadable or malformed file (JSON and decode errors are
            # ValueErrors): try the next candidate.
            continue

        raw_catalogs = (data.get("catalogs", []) if isinstance(data, dict) else data) or []
        if not isinstance(raw_catalogs, list):
            continue

        presets: List[Dict[str, Any]] = []
        for item in raw_catalogs:
            if not isinstance(item, dict):
                continue
            display_name = str(
                item.get("display_name") or item.get("name") or item.get("catalog_id") or ""
            ).strip()
            if display_name:
                presets.append({**item, "display_name": display_name})
        return presets
    return []
def _parse_api_dt(value: Any) -> Optional[datetime]:
    """Parse evaluator API timestamps into timezone-aware datetimes.

    Accepts ``datetime`` objects or anything whose ``str()`` is an ISO-8601
    timestamp (a trailing ``Z`` is read as ``+00:00``). Naive values are
    tagged as UTC. Returns ``None`` for ``None``, blank text, or anything
    that cannot be parsed.
    """

    def _ensure_aware(moment: datetime) -> datetime:
        if getattr(moment, "tzinfo", None) is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment

    if value is None:
        return None
    if isinstance(value, datetime):
        return _ensure_aware(value)
    try:
        raw = str(value).strip()
        if not raw:
            return None
        iso_text = raw[:-1] + "+00:00" if raw.endswith("Z") else raw
        return _ensure_aware(datetime.fromisoformat(iso_text))
    except Exception:
        return None
def _extract_catalog_url(report: Dict[str, Any]) -> str:
    """Return a best-effort catalog URL for linking from recent evaluator jobs.

    Prefers an explicit URL on the report's ``catalog`` object (``web_url``,
    ``url`` or ``catalog_url``). Otherwise builds a vehicle-catalog link from
    the report's ``project_id`` and the catalog's ``catalog_id`` / ``id``,
    percent-encoding both so unusual characters cannot change the URL's
    structure. Returns ``""`` when neither source is available.
    """
    catalog = report.get("catalog") or {}
    direct_url = str(
        catalog.get("web_url")
        or catalog.get("url")
        or catalog.get("catalog_url")
        or ""
    ).strip()
    if direct_url:
        return direct_url

    project_id = str(report.get("project_id") or "").strip()
    catalog_id = str(catalog.get("catalog_id") or catalog.get("id") or "").strip()
    if not (project_id and catalog_id):
        return ""
    # safe="" also encodes "/", so an id can never add a path segment or a
    # second query parameter.
    encoded_catalog = urllib.parse.quote(catalog_id, safe="")
    encoded_project = urllib.parse.quote(project_id, safe="")
    return (
        "https://evaluation.tier4.jp/evaluation/vehicle_catalogs/"
        f"{encoded_catalog}?project_id={encoded_project}"
    )
{_format_jst_time_title(started_like)})" + + +def _extract_case_totals(report: Dict[str, Any]) -> Dict[str, int]: + """Return total/success/failed/canceled counts from job report.""" + test = report.get("test") or {} + result = test.get("available_case_results") or test.get("case_results") or {} + return { + "total": int(result.get("total_count", 0) or 0), + "success": int(result.get("success_count", 0) or 0), + "failed": int(result.get("failure_count", 0) or 0), + "canceled": int(result.get("cancellation_count", 0) or 0), + } + + +def _extract_failed_case_rows(case_reports: List[Dict[str, Any]], *, limit: int = 50) -> List[Dict[str, Any]]: + """Normalize failed case rows for display tables.""" + rows: List[Dict[str, Any]] = [] + for report in case_reports: + status = str(report.get("status") or "").strip().lower() + result_status = str(((report.get("result") or {}).get("status") or "")).strip().lower() + if status not in evaluator_api.FAILED_JOB_STATUSES and result_status not in evaluator_api.FAILED_JOB_STATUSES: + continue + logs = report.get("logs") or {} + rows.append( + { + "Suite": ((report.get("suite") or {}).get("display_name") or ""), + "Scenario": ((report.get("scenario") or {}).get("display_name") or ""), + "Status": report.get("status", ""), + "Fail message": report.get("fail_message", ""), + "Cause": ", ".join(report.get("failure_cause_labels", []) or []), + "Archive log": "yes" if ((logs.get("simulation_archive") or {}).get("id")) else "no", + "Result JSON": "yes" if ((logs.get("simulation_result_json") or {}).get("id")) else "no", + } + ) + rows.sort(key=lambda row: (row["Suite"], row["Scenario"], row["Fail message"])) + return rows[:limit] + + +def _extract_suite_rows(suite_rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Normalize suite summary rows for display tables.""" + rows = [ + { + "Suite": row.get("name", ""), + "Total": int(row.get("all", 0) or 0), + "Success": int(row.get("success", 0) or 0), + "Failed": int(row.get("fail", 0) or 
0), + "Canceled": int(row.get("cancel", 0) or 0), + "Simulation": row.get("simulation", ""), + "Report": row.get("url", ""), + } + for row in suite_rows or [] + ] + rows.sort(key=lambda row: (-row["Failed"], row["Suite"])) + return rows + + +def _extract_suite_selection_options(suite_rows: List[Dict[str, Any]]) -> List[Dict[str, str]]: + """Build suite picker options from evaluator suite summary rows.""" + options: List[Dict[str, str]] = [] + seen_ids = set() + for row in suite_rows or []: + report_url = str(row.get("url") or row.get("Report") or "").strip() + suite_id = "" + if "/tests/" in report_url: + tail = report_url.split("/tests/", 1)[1] + suite_id = tail.split("?", 1)[0].split("/", 1)[0].strip() + if not suite_id or suite_id in seen_ids: + continue + seen_ids.add(suite_id) + suite_name = str(row.get("name") or row.get("Suite") or suite_id).strip() + options.append({"id": suite_id, "label": f"{suite_name} ({suite_id})"}) + return options + + +def _short_git_sha(sha: str, *, length: int = 8) -> str: + return str(sha or "").strip()[: max(1, int(length))] + + +def _format_source_ref_text(source_label: str, git_sha: str) -> str: + label = str(source_label or "").strip() + short_sha = _short_git_sha(git_sha) + if label and short_sha: + return f"{label} ({short_sha})" + return label or short_sha or "—" + + +def _format_source_ref_html( + source_label: str, + source_url: str, + git_sha: str, + git_commit_url: str, +) -> str: + label = html.escape(str(source_label or "").strip() or "—") + ref_url = html.escape(str(source_url or "").strip()) + short_sha = html.escape(_short_git_sha(git_sha)) + commit_url = html.escape(str(git_commit_url or "").strip()) + + if ref_url and label != "—": + label_html = f'{label}' + else: + label_html = label + + if short_sha: + sha_html = ( + f'{short_sha}' + if commit_url + else short_sha + ) + if label_html and label_html != "—": + return f"{label_html} ({sha_html})" + return sha_html + + return label_html + + +def 
_extract_retest_parent_job_id(report: Dict[str, Any]) -> str: + """Return the upstream source_job_id when this evaluator job was itself a retest.""" + event = report.get("event") or {} + candidates = ( + event.get("source_job_id"), + ((event.get("source_job") or {}).get("id") if isinstance(event.get("source_job"), dict) else ""), + report.get("source_job_id"), + ) + for candidate in candidates: + value = str(candidate or "").strip() + if value: + return value + return "" + + +def _resolve_retest_source_job_id( + project_id: str, + environment: str, + job_id: str, + *, + detail: Optional[Dict[str, Any]] = None, + max_depth: int = 5, +) -> str: + """Unwrap retest chains so scheduling reuses the earliest known source job.""" + current_job_id = str(job_id or "").strip() + current_detail = detail or {} + seen_job_ids: set[str] = set() + + while current_job_id and current_job_id not in seen_job_ids and len(seen_job_ids) < max_depth: + seen_job_ids.add(current_job_id) + raw_report = current_detail.get("raw_report") if isinstance(current_detail, dict) else {} + parent_job_id = _extract_retest_parent_job_id(raw_report or {}) + if not parent_job_id or parent_job_id in seen_job_ids: + return current_job_id + current_job_id = parent_job_id + try: + current_detail = _fetch_evaluator_job_detail(project_id, environment, current_job_id) + except Exception: + return current_job_id + + return current_job_id or str(job_id or "").strip() + + +def _status_color_variant(status: str) -> str: + """Map evaluator status to a style token used by the recent-job cards.""" + normalized = evaluator_api.normalize_job_status(status) + if normalized in evaluator_api.SUCCESS_JOB_STATUSES: + return "success" + if normalized in ("canceled", "cancelled", "aborted"): + return "canceled" + if normalized in evaluator_api.FAILED_JOB_STATUSES: + return "failed" + if normalized in ("started", "running", "pending", "queued", "created"): + return "running" + return "unknown" + + +def 
_status_display_label(status: str) -> str: + """Short status label for compact list rows.""" + normalized = evaluator_api.normalize_job_status(status) + if normalized in ("succeeded", "success"): + return "success" + if normalized in ("failed", "failure", "error"): + return "failed" + if normalized in ("canceled", "cancelled", "aborted"): + return "canceled" + if normalized in ("started", "running"): + return "running" + if normalized in ("pending", "queued", "created"): + return "queued" + return normalized or "unknown" + + +def _status_filter_values(selected_statuses: List[str]) -> List[str]: + """Normalize UI status filters into API status values.""" + values: List[str] = [] + for raw in selected_statuses: + normalized = evaluator_api.normalize_job_status(raw) + if normalized == "unknown" or not normalized: + continue + if normalized == "running": + values.extend(["running", "started"]) + elif normalized == "success": + values.extend(["success", "succeeded"]) + elif normalized == "failed": + values.extend(["failed", "failure", "error"]) + elif normalized == "canceled": + values.extend(["canceled", "cancelled", "aborted"]) + else: + values.append(normalized) + return sorted(set(values)) + + +def _escape_search_match_value(value: str) -> str: + """Escape wildcard characters for API Match filters.""" + return ( + value.replace("\\", "\\\\") + .replace("*", "\\*") + .replace("?", "\\?") + ) + + +def _build_recent_job_search_filter( + search_text: str, + search_scope: str, + user_directory: Optional[Dict[str, Dict[str, str]]] = None, +) -> tuple[Optional[Dict[str, Any]], str]: + """Map quick-search UI to one server-side filter and a client-side needle.""" + needle = search_text.strip() + if not needle: + return None, "" + + if search_scope == "Branch/tag": + return ( + { + "field": "event.source.git_ref", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Description": + return ( + { + 
"field": "description", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Job ID": + return ( + { + "field": "job_id", + "operator": "In", + "values": [needle], + }, + needle.lower(), + ) + if search_scope == "Git SHA": + return ( + { + "field": "event.source.git_sha", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Fail message": + return ( + { + "field": "fail_message", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + return None, needle.lower() + + +def _recent_job_search_history_key(scope: str) -> str: + return f"recent_eval_jobs_search_history::{scope}" + + +def _get_recent_job_search_history(scope: str) -> List[str]: + stored = get_config_value(_recent_job_search_history_key(scope), []) or [] + if not isinstance(stored, list): + return [] + return [str(v).strip() for v in stored if str(v).strip()] + + +def _save_recent_job_search_history(scope: str, value: str, *, max_items: int = 8) -> None: + text = str(value).strip() + if not text: + return + history = _get_recent_job_search_history(scope) + updated = [text] + [item for item in history if item != text] + set_config_value(_recent_job_search_history_key(scope), updated[:max_items]) + + +def _get_recent_eval_user_directory() -> Dict[str, Dict[str, str]]: + stored = get_config_value("recent_eval_jobs_user_directory", {}) or {} + if not isinstance(stored, dict): + return {} + normalized: Dict[str, Dict[str, str]] = {} + for subject_id, info in stored.items(): + if not isinstance(info, dict): + continue + normalized[str(subject_id)] = { + "name": str(info.get("name") or "").strip(), + "email": str(info.get("email") or "").strip(), + "subject_id": str(info.get("subject_id") or subject_id).strip(), + } + return normalized + + +def _save_recent_eval_user_directory(directory: Dict[str, Dict[str, 
str]]) -> None: + set_config_value("recent_eval_jobs_user_directory", directory) + + +@st.cache_data(ttl=24 * 3600, show_spinner=False) +def _fetch_auth_member_profile(subject_id: str, environment: str) -> Dict[str, str]: + subject = str(subject_id or "").strip() + if not subject: + return {} + org_id = os.environ.get( + "WEBAUTO_ORGANIZATION_ID", + "5a21621d-6968-4f7d-94f8-99cfb77b6e71", + ).strip() + if not org_id: + return {"subject_id": subject, "name": subject, "email": ""} + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + from webautoauth.token import HttpService, TokenSource, load_config + + config = load_config() + token_source = TokenSource(HttpService(config)) + access_token = token_source.get_token().access_token + quoted_subject = urllib.parse.quote(subject, safe="") + url = f"https://auth.web.auto/v2/organizations/{org_id}/members/{quoted_subject}" + response = requests.get( + url, + headers={"Authorization": f"Bearer {access_token}", "accept": "application/json"}, + timeout=10, + ) + response.raise_for_status() + data = response.json() + return { + "subject_id": str(data.get("subject_id") or subject), + "name": str(data.get("name") or subject).strip(), + "email": str(data.get("email") or "").strip(), + } + + +def _hydrate_recent_eval_user_directory( + jobs: List[Dict[str, Any]], + environment: str, +) -> Dict[str, Dict[str, str]]: + directory = _get_recent_eval_user_directory() + unresolved = sorted( + { + str(job.get("scheduled_by") or "").strip() + for job in jobs + if str(job.get("scheduled_by") or "").strip() + and str(job.get("scheduled_by") or "").strip() not in directory + } + ) + if not unresolved: + return directory + + updates: Dict[str, Dict[str, str]] = {} + with ThreadPoolExecutor(max_workers=min(6, len(unresolved))) as executor: + future_map = { + executor.submit(_fetch_auth_member_profile, subject_id, environment): subject_id + for subject_id in unresolved + } + for future in as_completed(future_map): + subject_id = 
future_map[future] + try: + profile = future.result() + except Exception: + profile = { + "subject_id": subject_id, + "name": subject_id, + "email": "", + } + updates[subject_id] = { + "subject_id": str(profile.get("subject_id") or subject_id).strip(), + "name": str(profile.get("name") or subject_id).strip(), + "email": str(profile.get("email") or "").strip(), + } + + if updates: + directory = {**directory, **updates} + _save_recent_eval_user_directory(directory) + return directory + + +def _build_recent_job_date_filters( + date_from: Optional[datetime.date], + date_to: Optional[datetime.date], +) -> List[Dict[str, Any]]: + """Build scheduled_at date-range filters for the search API.""" + filters: List[Dict[str, Any]] = [] + if date_from: + start_dt = datetime(date_from.year, date_from.month, date_from.day, 0, 0, 0, tzinfo=_JST) + filters.append( + { + "field": "scheduled_at", + "operator": "Gte", + "values": [start_dt.astimezone(timezone.utc).isoformat()], + } + ) + if date_to: + end_dt = datetime(date_to.year, date_to.month, date_to.day, 23, 59, 59, tzinfo=_JST) + filters.append( + { + "field": "scheduled_at", + "operator": "Lte", + "values": [end_dt.astimezone(timezone.utc).isoformat()], + } + ) + return filters + + +def _summarize_recent_job(report: Dict[str, Any]) -> Dict[str, Any]: + """Compact summary for one evaluator job card.""" + status = evaluator_api.extract_job_status(report) + totals = _extract_case_totals(report) + source = ((report.get("event") or {}).get("source") or {}) + git_url = str(source.get("git_web_url") or source.get("git_url") or "").strip() + source_repo_label = git_url.rstrip("/").split("/")[-1] if git_url else "—" + git_ref_label = _extract_git_target(report) + return { + "job_id": report.get("job_id") or report.get("id") or "", + "title": _extract_job_title(report), + "status": status, + "status_variant": _status_color_variant(status), + "build_status": ((report.get("build") or {}).get("status") or ""), + "test_status": 
((report.get("test") or {}).get("status") or ""), + "target": git_ref_label, + "catalog": ((report.get("catalog") or {}).get("display_name") or ""), + "catalog_url": _extract_catalog_url(report), + "description": report.get("description", ""), + "source_label": git_ref_label, + "source_repo_label": source_repo_label, + "scheduled_at": report.get("scheduled_at"), + "started_at": report.get("started_at"), + "finished_at": report.get("finished_at"), + "duration": _format_duration(report.get("started_at"), report.get("finished_at")), + "created_label": _format_relative_time(report.get("scheduled_at") or report.get("started_at")), + "scheduled_by": str(report.get("scheduled_by") or ""), + "report_url": evaluator_api.get_job_report_url(report.get("project_id", ""), report.get("job_id") or report.get("id") or ""), + "fail_message": report.get("fail_message", ""), + "total": totals["total"], + "success": totals["success"], + "failed": totals["failed"], + "canceled": totals["canceled"], + "git_sha": str(source.get("git_sha") or "")[:12], + "git_ref_url": source.get("git_ref_url", ""), + "git_commit_url": source.get("git_commit_url", ""), + "source_url": git_url, + } + + +@st.cache_data(ttl=30, show_spinner=False) +def _fetch_recent_evaluator_job_pages( + project_id: str, + environment: str, + page_size: int, + pages_to_fetch: int, + status_values: tuple[str, ...] = (), + extra_filters: tuple[tuple[str, str, tuple[Any, ...]], ...] 
= (), +) -> List[Dict[str, Any]]: + """Fetch recent evaluator jobs from the search endpoint page-by-page.""" + if not project_id: + return [] + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + api = evaluator_api.EvaluationRunAPI() + filters: List[Dict[str, Any]] = [] + if status_values: + filters.append( + { + "field": "status", + "operator": "In", + "values": list(status_values), + } + ) + for field, operator, values in extra_filters: + filters.append( + { + "field": field, + "operator": operator, + "values": list(values), + } + ) + next_token = "" + pages: List[Dict[str, Any]] = [] + for _ in range(max(1, int(pages_to_fetch))): + data = api.search_report_list( + project_id, + filters=filters or None, + next_token=next_token, + size=max(1, min(int(page_size), 100)), + ) + reports = data.get("reports", []) or [] + pages.append( + { + "jobs": [_summarize_recent_job(report) for report in reports], + "next_token": data.get("next_token", "") or "", + } + ) + next_token = data.get("next_token", "") or "" + if not next_token: + break + return pages + + +@st.cache_data(ttl=30, show_spinner=False) +def _fetch_evaluator_job_detail(project_id: str, environment: str, job_id: str) -> Dict[str, Any]: + """Fetch deep evaluator detail for one job on demand.""" + if not project_id or not job_id: + return {} + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + api = evaluator_api.EvaluationRunAPI() + report = api.get_job_report(project_id, job_id) + suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True) + case_reports = api.get_case_reports(project_id, job_id) + summary = _summarize_recent_job(report) + return { + **summary, + "suite_rows": _extract_suite_rows(suite_rows), + "failed_case_rows": _extract_failed_case_rows(case_reports), + "raw_report": report, + } + + +def _inject_recent_evaluator_jobs_styles() -> None: + """Task-adjacent styles for the recent evaluator jobs section.""" + st.markdown( + """ + + """, + 
unsafe_allow_html=True, + ) + + +def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = "(Auto)") -> None: + """Render one recent evaluator job as a single-row list item.""" + variant = html.escape(job.get("status_variant", "unknown")) + status = html.escape(_status_display_label(job.get("status", "unknown") or "unknown")) + title_text = html.escape(job.get("title", "—")) + description = html.escape(job.get("description", "") or "") + catalog = html.escape(job.get("catalog", "") or "—") + catalog_url = html.escape(job.get("catalog_url", "") or "") + scheduled = html.escape(_format_jst_time_compact(job.get("scheduled_at"))) + duration = html.escape(job.get("duration", "—")) + job_id = html.escape(str(job.get("job_id", ""))) + build_status = html.escape(job.get("build_status", "") or "—") + test_status = html.escape(job.get("test_status", "") or "—") + created_label = html.escape(job.get("created_label", "—")) + git_sha = str(job.get("git_sha", "") or "").strip() + source_label = str(job.get("source_label", "") or "—").strip() + user_text = html.escape(user_label or "(Auto)") + report_url = html.escape(job.get("report_url", "") or "") + source_url = str(job.get("git_ref_url", "") or job.get("source_url", "") or "").strip() + git_commit_url = str(job.get("git_commit_url", "") or "").strip() + status_variant = job.get("status_variant", "unknown") + status_mark = { + "running": '', + "success": '', + "failed": '', + "canceled": '', + }.get(status_variant, '') + meta_line = job_id + total = int(job.get("total", 0) or 0) + success = int(job.get("success", 0) or 0) + failed = int(job.get("failed", 0) or 0) + canceled = int(job.get("canceled", 0) or 0) + if status_variant == "running" and total == 0 and success == 0 and failed == 0 and canceled == 0: + counts = "Running..." 
+ else: + counts = ( + f'✅ {success} · ' + f'❌ {failed} · ' + f'⏹ {canceled} / ' + f'{total}' + ) + title_html = f'{title_text}' if report_url else title_text + source_html = _format_source_ref_html(source_label, source_url, git_sha, git_commit_url) + catalog_html = ( + f'{catalog}' + if catalog_url else catalog + ) + st.markdown( + f""" +
+
+
+
{title_html}
+
{meta_line}
+
+
+ {status_mark}{status} +
+
+ {scheduled} ({created_label})
{duration} +
+
+ {catalog_html}
{source_html} +
+
+ {user_text} +
+
+ build {build_status} · test {test_status}
+ {counts} +
+
+
+ """, + unsafe_allow_html=True, + ) + + +def _render_recent_evaluator_job_detail(project_id: str, environment: str, job: Dict[str, Any]) -> None: + """Render detailed evaluator-job information inside an expander.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.warning("Missing job id.") + return + try: + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + except Exception as e: + st.error(f"Could not fetch evaluator details: {e}") + return + + st.markdown("**Overview**") + top_cols = st.columns(4) + top_cols[0].metric("Total", int(detail.get("total", 0))) + top_cols[1].metric("Success", int(detail.get("success", 0))) + top_cols[2].metric("Failed", int(detail.get("failed", 0))) + top_cols[3].metric("Canceled", int(detail.get("canceled", 0))) + + overview_left, overview_right = st.columns([1.3, 1.1]) + with overview_left: + st.write(f"Status: `{detail.get('status', 'unknown')}`") + st.write(f"Title: `{detail.get('title', '—')}`") + st.write(f"Build/Test: `{detail.get('build_status', '—')}` / `{detail.get('test_status', '—')}`") + st.write(f"Ref: `{_format_source_ref_text(detail.get('target', ''), detail.get('git_sha', ''))}`") + st.write(f"Catalog: `{detail.get('catalog', '—')}`") + st.write(f"Repo: `{detail.get('source_repo_label', '—')}`") + with overview_right: + st.write(f"Scheduled: `{_format_jst_time(detail.get('scheduled_at'), include_seconds=True)}`") + st.write(f"Started: `{_format_jst_time(detail.get('started_at'), include_seconds=True)}`") + st.write(f"Finished: `{_format_jst_time(detail.get('finished_at'), include_seconds=True)}`") + st.write(f"Duration: `{detail.get('duration', '—')}`") + + action_cols = st.columns([1.2, 1.2, 4]) + report_url = detail.get("report_url", "") + catalog_url = detail.get("catalog_url", "") + source_url = detail.get("source_url", "") or detail.get("git_ref_url", "") + with action_cols[0]: + if report_url: + st.link_button("Open report", report_url, use_container_width=True) + with 
action_cols[1]: + if catalog_url: + st.link_button("Open catalog", catalog_url, use_container_width=True) + with action_cols[2]: + if source_url: + st.link_button("Open source", source_url, use_container_width=True) + + if detail.get("fail_message"): + st.warning(detail.get("fail_message")) + + suite_rows = detail.get("suite_rows") or [] + with st.expander(f"Suites ({len(suite_rows)})", expanded=bool(suite_rows)): + if suite_rows: + st.dataframe(pd.DataFrame(suite_rows), width="stretch", hide_index=True) + else: + st.caption("No suite summary available.") + + failed_case_rows = detail.get("failed_case_rows") or [] + with st.expander(f"Failed Cases ({len(failed_case_rows)})", expanded=bool(failed_case_rows)): + if failed_case_rows: + st.dataframe(pd.DataFrame(failed_case_rows), width="stretch", hide_index=True) + else: + st.caption("No failed cases in the current report.") + + with st.expander("Raw JSON", expanded=False): + st.json(detail.get("raw_report", {})) + + +def _render_recent_evaluator_job_run_dialog( + project_id: str, + environment: str, + job: Dict[str, Any], + *, + output_path_default: str, + download_type_default: str, + phase_default: str, + skip_large_file_default: bool, + large_file_mb_default: float, + keep_zip_files_default: bool, +) -> None: + """Render the dialog used to enqueue Download + Eval + Parquet from a recent job row.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.error("Missing evaluator job id.") + return + + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + suite_options = _extract_suite_selection_options(detail.get("suite_rows") or []) + suite_label_to_id = {opt["label"]: opt["id"] for opt in suite_options} + suite_labels = [opt["label"] for opt in suite_options] + + st.caption("Confirm the workflow options for this evaluator job, then start a background task.") + summary_cols = st.columns([1.45, 1.15, 1.35, 1.05]) + summary_cols[0].markdown(f"**Title** \n`{detail.get('title', '—')}`") + 
summary_cols[1].markdown(f"**Status** \n`{detail.get('status', 'unknown')}`") + summary_cols[2].markdown(f"**Catalog** \n`{detail.get('catalog', '—')}`") + summary_cols[3].markdown(f"**Cases** \n`{int(detail.get('total', 0))}`") + + with st.form(key=f"recent_eval_run_form_{job_id}", border=False): + run_output_path = st.text_input( + "Output path", + value=output_path_default, + help="Folder under the data directory. This uses the same safe path rules as the main download workflow.", + ) + + if not suite_labels: + hint_cols = st.columns([1.2, 2.8]) + with hint_cols[0]: + if st.form_submit_button("Refresh suites", use_container_width=True): + _fetch_evaluator_job_detail.clear() + st.rerun() + with hint_cols[1]: + st.caption("No suite candidates were available yet for this job. Refresh to re-read suite data from the evaluator API.") + + selected_suite_labels = st.multiselect( + "Suites to download (optional)", + options=suite_labels, + default=[], + help="Leave empty to download all suites from this evaluator job.", + disabled=not suite_labels, + ) + + run_download_type = st.radio( + "Download type", + ["Archives (ZIP)", "Result JSON only"], + index=0 if download_type_default == "Archives (ZIP)" else 1, + horizontal=True, + ) + + run_phase = "" + run_skip_large_file = False + run_large_file_mb = 50.0 + run_keep_zip_files = False + if run_download_type == "Archives (ZIP)": + run_phase = st.text_input( + "Phase to extract", + value=phase_default, + help="Enter the phase name to extract from archives.", + ) + opt_cols = st.columns([1.2, 1.3, 1.2]) + with opt_cols[0]: + run_skip_large_file = st.checkbox( + "Skip large files", + value=skip_large_file_default, + help="Skip unusually large archives during download.", + ) + with opt_cols[1]: + run_large_file_mb = st.number_input( + "Skip threshold (MB)", + min_value=1.0, + max_value=5000.0, + step=1.0, + value=float(large_file_mb_default), + ) + with opt_cols[2]: + run_keep_zip_files = st.checkbox( + "Keep ZIP files", + 
value=keep_zip_files_default, + help="Keep downloaded ZIPs after extraction.", + ) + + run_cols = st.columns([1.25, 1.25, 1.1]) + with run_cols[0]: + run_eval = st.checkbox( + "Run evaluation", + value=True, + help="Run eval_result and generate Summary.csv / Score.csv after download.", + ) + with run_cols[1]: + generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + disabled=not CATALOG_IO_AVAILABLE, + help="Build scene_result.parquet from .pkl files." if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable this.", + ) + with run_cols[2]: + eval_recursive = st.checkbox( + "Recursive eval", + value=True, + help="Search subdirectories for evaluation result folders.", + ) + + action_cols = st.columns([1.15, 1.15, 3.7]) + cancel_clicked = action_cols[0].form_submit_button("Cancel", use_container_width=True) + start_clicked = action_cols[1].form_submit_button("Start", type="primary", use_container_width=True) + + if cancel_clicked: + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + + if not start_clicked: + return + + resolved_output, path_err = resolve_under_data_root(run_output_path, allow_create=True) + if path_err: + st.error(f"Output path is invalid: {path_err}") + return + + selected_suite_ids = [suite_label_to_id[label] for label in selected_suite_labels] + resolved_path_str = str(resolved_output) + set_config_value("output_path", to_data_relative(resolved_output)) + set_config_value("environment", environment) + set_config_value("project_id", project_id) + set_config_value("job_id", job_id) + set_config_value("suite_id", "") + set_config_value("suite_ids", selected_suite_ids) + set_config_value("download_type", run_download_type) + if run_download_type == "Archives (ZIP)": + set_config_value("phase", run_phase) + set_config_value("skip_large_file", run_skip_large_file) + set_config_value("large_file_mb", run_large_file_mb) + set_config_value("keep_zip_files", run_keep_zip_files) + + 
params = { + "output_path": resolved_path_str, + "project_id": project_id, + "job_id": job_id, + "suite_id": "", + "suite_ids": selected_suite_ids or None, + "download_type": "archives" if run_download_type == "Archives (ZIP)" else "result_json", + "phase": run_phase if run_download_type == "Archives (ZIP)" else "", + "skip_large_file": run_skip_large_file if run_download_type == "Archives (ZIP)" else False, + "large_file_mb": run_large_file_mb if run_download_type == "Archives (ZIP)" else 50.0, + "keep_zip_files": run_keep_zip_files if run_download_type == "Archives (ZIP)" else False, + "run_eval": run_eval, + "generate_parquet": generate_parquet, + "eval_recursive": eval_recursive, + "eval_overwrite": False, + "eval_workers": _default_eval_workers(), + } + task_id = _enqueue_task("download_and_eval", params) + if not task_id: + st.error("Failed to enqueue task. Check REDIS_URL and DATABASE_URL.") + return + + st.session_state["recent_eval_jobs_flash"] = ( + f"Queued Download + Eval + Parquet for `{detail.get('title', job_id)}`. " + f"Task id: `{task_id}`." 
+ ) + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + + +def _render_recent_evaluator_job_retest_dialog( + project_id: str, + environment: str, + job: Dict[str, Any], + *, + output_path_default: str, + phase_default: str, +) -> None: + """Render a compact workflow launcher that reuses build artifacts from a prior evaluator job.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.error("Missing evaluator job id.") + return + + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + raw_report = detail.get("raw_report") or {} + raw_catalog = raw_report.get("catalog") or {} + resolved_source_job_id = _resolve_retest_source_job_id( + project_id, + environment, + job_id, + detail=detail, + ) + suite_options = _extract_suite_selection_options(detail.get("suite_rows") or []) + suite_label_to_id = {opt["label"]: opt["id"] for opt in suite_options} + suite_labels = [opt["label"] for opt in suite_options] + preset_entries = _load_catalog_presets() + preset_names = [str(entry.get("display_name") or "").strip() for entry in preset_entries if str(entry.get("display_name") or "").strip()] + preset_by_name = {str(entry.get("display_name") or "").strip(): entry for entry in preset_entries} + + original_catalog_name = str(raw_catalog.get("display_name") or detail.get("catalog") or "").strip() + original_catalog_id = str(raw_catalog.get("id") or "").strip() + default_preset_name = original_catalog_name if original_catalog_name in preset_by_name else "" + + import re + + default_output_path = output_path_default + if not default_output_path: + clean_target = re.sub(r"[^\w]+", "_", str(detail.get("target") or job_id).strip()).strip("_") or "artifact" + default_output_path = f"retest_{clean_target}_{datetime.now().strftime('%Y%m%d_%H%M%S')}" + + st.caption("Schedule a new evaluator workflow that reuses build artifacts from this job, then download and process the new results.") + summary_cols = st.columns([1.35, 1.0, 1.25, 
1.2]) + summary_cols[0].markdown(f"**Source job** \n`{job_id}`") + summary_cols[1].markdown(f"**Ref** \n`{detail.get('target', '—')}`") + summary_cols[2].markdown(f"**Original catalog** \n`{original_catalog_name or '—'}`") + summary_cols[3].markdown(f"**Suites found** \n`{len(suite_labels)}`") + if resolved_source_job_id and resolved_source_job_id != job_id: + st.caption(f"Using upstream source job `{resolved_source_job_id}` for scheduling because this job is already a retest.") + + preset_key = f"recent_eval_retest_catalog_preset_{job_id}" + last_preset_key = f"recent_eval_retest_last_catalog_preset_{job_id}" + catalog_id_key = f"recent_eval_retest_catalog_id_{job_id}" + suite_selection_key = _retest_suite_selection_key(job_id) + if preset_key not in st.session_state: + st.session_state[preset_key] = default_preset_name + if last_preset_key not in st.session_state: + st.session_state[last_preset_key] = "" + if catalog_id_key not in st.session_state: + st.session_state[catalog_id_key] = original_catalog_id + if suite_selection_key not in st.session_state: + st.session_state[suite_selection_key] = [] + + selected_preset_name = st.selectbox( + "Catalog preset", + options=[""] + preset_names, + index=([""] + preset_names).index(default_preset_name) if default_preset_name in preset_names else 0, + key=preset_key, + help="Choose a preset catalog, or leave this empty and enter a catalog id manually.", + format_func=lambda value: value or "Custom / manual", + ) + selected_preset = preset_by_name.get(selected_preset_name or "", {}) + if st.session_state[last_preset_key] != selected_preset_name and selected_preset_name: + st.session_state[catalog_id_key] = str(selected_preset.get("catalog_id") or "") + st.session_state[last_preset_key] = selected_preset_name + elif st.session_state[last_preset_key] != selected_preset_name and not selected_preset_name: + st.session_state[catalog_id_key] = original_catalog_id + st.session_state[last_preset_key] = selected_preset_name + 
catalog_id = st.text_input( + "Catalog ID", + value="", + key=catalog_id_key, + help="You can switch to a different catalog while still reusing the build artifacts from the source job.", + ).strip() + + selected_suite_labels = st.multiselect( + "Suites to run", + options=suite_labels, + key=suite_selection_key, + help="Defaults to empty. Leave it empty to let the evaluator use its default suite selection, or choose specific suites to rerun.", + disabled=not suite_labels, + ) + description = st.text_input( + "Description", + value="", + help="Leave empty to use an automatic evaluator artifact-retest name.", + ).strip() + retest_output_path = st.text_input( + "Output path", + value=default_output_path, + help="Folder under the data directory for the downloaded retest results.", + ) + run_download_type = st.radio( + "Download type", + ["Archives (ZIP)", "Result JSON only"], + index=0, + horizontal=True, + ) + run_phase = "" + if run_download_type == "Archives (ZIP)": + run_phase = st.text_input( + "Phase to extract", + value=phase_default, + help="Enter the phase name to extract from archives.", + ) + + run_cols = st.columns([1.2, 1.2, 1.0]) + with run_cols[0]: + run_eval = st.checkbox( + "Run evaluation", + value=True, + help="Run eval_result and generate Summary.csv / Score.csv after download.", + ) + with run_cols[1]: + generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + disabled=not CATALOG_IO_AVAILABLE, + help="Build scene_result.parquet from .pkl files." 
if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable this.", + ) + with run_cols[2]: + eval_recursive = st.checkbox( + "Recursive eval", + value=True, + help="Search subdirectories for evaluation result folders.", + ) + + action_cols = st.columns([1.15, 1.15, 3.7]) + cancel_clicked = action_cols[0].button("Cancel", key=f"recent_eval_retest_cancel_{job_id}", use_container_width=True) + start_clicked = action_cols[1].button("Retest", key=f"recent_eval_retest_start_{job_id}", type="primary", use_container_width=True) + + if cancel_clicked: + st.session_state.pop(suite_selection_key, None) + st.session_state.pop("recent_eval_jobs_retest_selected", None) + st.rerun() + + if not start_clicked: + return + + final_catalog_id = str(selected_preset.get("catalog_id") or catalog_id or "").strip() + if not final_catalog_id: + st.error("Catalog ID is required.") + return + + resolved_output, path_err = resolve_under_data_root(retest_output_path, allow_create=True) + if path_err: + st.error(f"Output path is invalid: {path_err}") + return + + selected_suite_ids = [suite_label_to_id[label] for label in selected_suite_labels] + resolved_path_str = str(resolved_output) + has_custom_catalog = bool(final_catalog_id and not selected_preset_name) + final_description = description or _make_retest_description( + str(detail.get("target") or job_id), + selected_preset_name, + has_custom_catalog=has_custom_catalog, + ) + + task_id = _enqueue_task( + "run_evaluator_and_process", + { + "project_id": project_id, + "catalog_id": final_catalog_id, + "integration_id": "", + "source_job_id": resolved_source_job_id or job_id, + "suite_ids": selected_suite_ids or None, + "target_name": "", + "description": final_description, + "output_path": resolved_path_str, + "environment": environment, + "max_retries": 0, + "clean_build": False, + "debug": False, + "is_tag": False, + "download_type": "archives" if run_download_type == "Archives (ZIP)" else "result_json", + "phase": 
run_phase, + "skip_large_file": False, + "large_file_mb": 50.0, + "keep_zip_files": False, + "poll_interval": 60, + "max_wait_seconds": 6 * 3600, + "run_eval": run_eval, + "generate_parquet": generate_parquet, + "eval_recursive": eval_recursive, + "eval_overwrite": False, + "eval_workers": _default_eval_workers(), + }, + ) + if not task_id: + st.error("Failed to enqueue task. Check REDIS_URL and DATABASE_URL.") + return + + set_config_value("output_path", to_data_relative(resolved_output)) + set_config_value("environment", environment) + set_config_value("project_id", project_id) + set_config_value("catalog_id", final_catalog_id) + set_config_value("suite_ids", selected_suite_ids) + + st.session_state["recent_eval_jobs_flash"] = ( + f"Queued artifact retest for `{detail.get('title', job_id)}`. " + f"Task id: `{task_id}`." + ) + st.session_state.pop(suite_selection_key, None) + st.session_state.pop("recent_eval_jobs_retest_selected", None) + st.rerun() + + +def _render_recent_evaluator_jobs_section( + project_id: str, + environment: str, + *, + output_path_default: str, + download_type_default: str, + phase_default: str, + skip_large_file_default: bool, + large_file_mb_default: float, + keep_zip_files_default: bool, + show_toggle: bool = True, + default_visible: bool = False, + show_title: bool = True, +) -> None: + """Render a direct evaluator-jobs browser above the download tabs.""" + _inject_recent_evaluator_jobs_styles() + if show_toggle: + show_section = st.toggle( + "Show recent evaluator jobs", + value=st.session_state.get("recent_eval_jobs_show", default_visible), + key="recent_eval_jobs_show", + help="Load recent evaluator jobs only when you want to browse them.", + ) + else: + show_section = True + st.session_state["recent_eval_jobs_show"] = True + if not show_section: + return + + if show_title: + st.subheader("Recent evaluator jobs") + st.caption("Compact browser for recent evaluator jobs. 
Select one job to inspect detailed suite and failed-case information.") + flash_message = st.session_state.pop("recent_eval_jobs_flash", None) + if flash_message: + st.success(flash_message) + user_directory = _get_recent_eval_user_directory() + + control_cols = st.columns([0.75, 1.0, 1.15, 1.45, 1.25, 1.0, 1.0, 0.75]) + with control_cols[0]: + st.markdown('
Rows
', unsafe_allow_html=True) + limit = int( + st.selectbox( + "Rows", + options=[10, 20, 50, 100], + index=1, + key="recent_eval_jobs_limit", + help="How many recent evaluator jobs to fetch for this project.", + label_visibility="collapsed", + ) + ) + with control_cols[1]: + st.markdown('
Status
', unsafe_allow_html=True) + status_filter = st.multiselect( + "Status", + options=["running", "success", "failed", "canceled", "unknown"], + default=[], + key="recent_eval_jobs_status_filter", + help="Leave empty to show all recent jobs.", + label_visibility="collapsed", + placeholder="All statuses", + ) + with control_cols[2]: + st.markdown('
Search In
', unsafe_allow_html=True) + search_scope = st.selectbox( + "Search in", + options=["Branch/tag", "Description", "Job ID", "Git SHA", "Fail message"], + index=1, + key="recent_eval_jobs_search_scope", + help="Choose which evaluator field the quick search should target.", + label_visibility="collapsed", + ) + with control_cols[3]: + st.markdown('
Search
', unsafe_allow_html=True) + search_text = st.text_input( + "Search", + value=st.session_state.get("recent_eval_jobs_search_text", ""), + key="recent_eval_jobs_search_text", + help="Server-side search across the selected field.", + label_visibility="collapsed", + placeholder="Type to search evaluator jobs", + ).strip() + selected_user_name = "" + user_candidates = sorted( + { + info.get("name", "").strip() + for info in user_directory.values() + if info.get("name", "").strip() + }, + key=str.lower, + ) + with control_cols[4]: + st.markdown('
User
', unsafe_allow_html=True) + selected_user_name = st.selectbox( + "User", + options=[""] + user_candidates, + index=0, + key="recent_eval_jobs_user_filter", + help="Filter jobs by resolved scheduled user name.", + label_visibility="collapsed", + ) + with control_cols[5]: + st.markdown('
From
', unsafe_allow_html=True) + date_from = st.date_input( + "From", + value=st.session_state.get("recent_eval_jobs_date_from", None), + key="recent_eval_jobs_date_from", + label_visibility="collapsed", + help="Scheduled-at lower bound in JST.", + ) + with control_cols[6]: + st.markdown('
To
', unsafe_allow_html=True) + date_to = st.date_input( + "To", + value=st.session_state.get("recent_eval_jobs_date_to", None), + key="recent_eval_jobs_date_to", + label_visibility="collapsed", + help="Scheduled-at upper bound in JST.", + ) + with control_cols[7]: + st.markdown('
Actions
', unsafe_allow_html=True) + if st.button("Refresh", key="refresh_recent_eval_jobs", use_container_width=True): + _fetch_recent_evaluator_job_pages.clear() + _fetch_evaluator_job_detail.clear() + st.rerun() + + page_key = "recent_eval_jobs_page" + if page_key not in st.session_state: + st.session_state[page_key] = 1 + if date_from and date_to and date_from > date_to: + st.warning("`From` date must be earlier than or equal to `To` date.") + return + + def _render_job_list() -> None: + nonlocal user_directory + if not project_id: + st.info("Enter a project id to browse recent evaluator jobs.") + return + current_page = max(1, int(st.session_state.get(page_key, 1))) + pages_to_fetch = max(3, current_page + 2) + if search_text or status_filter or date_from or date_to or selected_user_name: + pages_to_fetch = max(pages_to_fetch, 6) + server_status_values = tuple(_status_filter_values(status_filter)) + server_search_filter, search_needle = _build_recent_job_search_filter(search_text, search_scope, user_directory) + selected_user_ids = sorted( + { + subject_id + for subject_id, info in user_directory.items() + if selected_user_name + and selected_user_name.lower() == str(info.get("name") or "").strip().lower() + } + ) + server_date_filters = _build_recent_job_date_filters(date_from, date_to) + extra_filters: List[Dict[str, Any]] = [] + if server_search_filter: + extra_filters.append(server_search_filter) + if selected_user_ids: + extra_filters.append( + { + "field": "scheduled_by", + "operator": "In", + "values": selected_user_ids, + } + ) + extra_filters.extend(server_date_filters) + extra_filter_tuples = tuple( + ( + str(f["field"]), + str(f["operator"]), + tuple(f.get("values", []) or []), + ) + for f in extra_filters + ) + fetch_help = "Loading evaluator jobs..." + if search_text or status_filter or date_from or date_to or selected_user_name: + fetch_help = "Loading evaluator jobs with filters..." 
+ try: + with st.spinner(fetch_help): + fetched_pages = _fetch_recent_evaluator_job_pages( + project_id, + environment, + limit, + pages_to_fetch, + status_values=server_status_values, + extra_filters=extra_filter_tuples, + ) + except requests.Timeout: + st.error("Timed out while loading evaluator jobs. The evaluator server may be slow right now. Try Refresh.") + return + except requests.RequestException as e: + st.error(_friendly_request_error_message(e)) + return + except Exception as e: + st.error(_friendly_request_error_message(e)) + return + if search_text: + _save_recent_job_search_history(search_scope, search_text) + + jobs = [job for page in fetched_pages for job in page.get("jobs", [])] + user_directory = _hydrate_recent_eval_user_directory(jobs, environment) + has_more_from_api = bool(fetched_pages and fetched_pages[-1].get("next_token")) + + if not fetched_pages: + st.warning("No response was returned from the evaluator server. Try Refresh.") + return + + if search_needle: + if search_scope == "Branch/tag": + jobs = [job for job in jobs if search_needle in str(job.get("target", "")).lower()] + elif search_scope == "Description": + jobs = [job for job in jobs if search_needle in str(job.get("description", "")).lower() or search_needle in str(job.get("title", "")).lower()] + elif search_scope == "Job ID": + jobs = [job for job in jobs if search_needle in str(job.get("job_id", "")).lower()] + elif search_scope == "Git SHA": + jobs = [job for job in jobs if search_needle in str(job.get("git_sha", "")).lower()] + elif search_scope == "Fail message": + jobs = [job for job in jobs if search_needle in str(job.get("fail_message", "")).lower()] + if selected_user_name: + selected_lower = selected_user_name.lower() + jobs = [ + job for job in jobs + if selected_lower == str((user_directory.get(str(job.get("scheduled_by") or "").strip(), {}) or {}).get("name", "")).strip().lower() + ] + if status_filter: + selected = {evaluator_api.normalize_job_status(v) for v in 
status_filter} + jobs = [job for job in jobs if job.get("status_variant") in selected or evaluator_api.normalize_job_status(job.get("status", "")) in selected] + + if not jobs: + st.session_state[page_key] = 1 + empty_message = "No recent evaluator jobs were returned." + if search_text or status_filter or date_from or date_to or selected_user_name: + empty_message = "No recent evaluator jobs matched the current filters." + st.markdown(f'
{html.escape(empty_message)}
', unsafe_allow_html=True) + return + + total_loaded = len(jobs) + has_next_page = total_loaded > current_page * limit or has_more_from_api + max_known_page = max(1, (total_loaded + limit - 1) // limit) + if current_page > max_known_page: + current_page = max_known_page + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * limit + end_idx = start_idx + limit + visible_jobs = jobs[start_idx:end_idx] + if not visible_jobs and current_page > 1: + current_page = max(1, current_page - 1) + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * limit + end_idx = start_idx + limit + visible_jobs = jobs[start_idx:end_idx] + has_next_page = total_loaded > current_page * limit + + if current_page == 1: + page_numbers = list(range(1, min(3, max_known_page) + 1)) + else: + page_numbers = list( + range( + max(1, current_page - 1), + min(max_known_page, current_page + 1) + 1, + ) + ) + pager_cols = st.columns([0.8, 0.9, 0.9, 0.9, 0.8, 5.7]) + with pager_cols[0]: + if st.button("‹", key="recent_eval_jobs_prev", use_container_width=True, disabled=current_page <= 1): + st.session_state[page_key] = max(1, current_page - 1) + st.rerun() + for idx, page_num in enumerate(page_numbers[:3], start=1): + with pager_cols[idx]: + btn_key = ( + f"recent_eval_jobs_pagebtn_active_{page_num}" + if page_num == current_page + else f"recent_eval_jobs_pagebtn_{page_num}" + ) + if st.button( + str(page_num), + key=btn_key, + use_container_width=True, + disabled=page_num == current_page, + ): + st.session_state[page_key] = page_num + st.rerun() + with pager_cols[4]: + if st.button("›", key="recent_eval_jobs_next", use_container_width=True, disabled=not has_next_page): + st.session_state[page_key] = current_page + 1 + st.rerun() + + selected_job_id = st.session_state.get("recent_eval_jobs_selected") + if selected_job_id and not any(str(job.get("job_id", "")) == str(selected_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_selected", None) 
+ selected_job_id = None + + selected_run_job_id = st.session_state.get("recent_eval_jobs_run_selected") + if selected_run_job_id and not any(str(job.get("job_id", "")) == str(selected_run_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_run_selected", None) + selected_run_job_id = None + + selected_retest_job_id = st.session_state.get("recent_eval_jobs_retest_selected") + if selected_retest_job_id and not any(str(job.get("job_id", "")) == str(selected_retest_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_retest_selected", None) + selected_retest_job_id = None + + st.markdown('
', unsafe_allow_html=True) + for job in visible_jobs: + subject_id = str(job.get("scheduled_by") or "").strip() + user_info = user_directory.get(subject_id, {}) + user_label = str(user_info.get("name") or subject_id or "(Auto)").strip() + row_cols = st.columns([9.2, 2.6]) + with row_cols[0]: + _render_recent_evaluator_job_card(job, user_label=user_label) + with row_cols[1]: + action_cols = st.columns([1.0, 1.0, 1.0], gap="small") + with action_cols[0]: + if st.button("Details", key=f"recent_eval_view_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() + with action_cols[1]: + if st.button("Start", key=f"recent_eval_run_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_run_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() + with action_cols[2]: + if st.button("Retest", key=f"recent_eval_retest_{job['job_id']}", use_container_width=True): + st.session_state.pop(_retest_suite_selection_key(str(job["job_id"])), None) + st.session_state["recent_eval_jobs_retest_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() + st.markdown("
", unsafe_allow_html=True) + + selected_job_id = st.session_state.get("recent_eval_jobs_selected") + if selected_job_id: + selected_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_job_id)), None) + if selected_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Job details · {selected_job.get('title', '—')}", width="large") + def _recent_eval_job_dialog() -> None: + _render_recent_evaluator_job_detail(project_id, environment, selected_job) + if st.button("Close", key="recent_eval_jobs_close_detail", use_container_width=True): + st.session_state.pop("recent_eval_jobs_selected", None) + st.rerun() + + _recent_eval_job_dialog() + finally: + st.session_state.pop("recent_eval_jobs_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Job details · {selected_job.get('title', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_detail_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_selected", None) + st.rerun() + _render_recent_evaluator_job_detail(project_id, environment, selected_job) + st.markdown("
", unsafe_allow_html=True) + + selected_run_job_id = st.session_state.get("recent_eval_jobs_run_selected") + if selected_run_job_id: + selected_run_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_run_job_id)), None) + if selected_run_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Download + Eval + Parquet · {selected_run_job.get('title', '—')}", width="large") + def _recent_eval_run_dialog() -> None: + _render_recent_evaluator_job_run_dialog( + project_id, + environment, + selected_run_job, + output_path_default=output_path_default, + download_type_default=download_type_default, + phase_default=phase_default, + skip_large_file_default=skip_large_file_default, + large_file_mb_default=large_file_mb_default, + keep_zip_files_default=keep_zip_files_default, + ) + + _recent_eval_run_dialog() + finally: + if st.session_state.get("recent_eval_jobs_run_selected") == str(selected_run_job_id): + st.session_state.pop("recent_eval_jobs_run_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Download + Eval + Parquet · {selected_run_job.get('title', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_run_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + _render_recent_evaluator_job_run_dialog( + project_id, + environment, + selected_run_job, + output_path_default=output_path_default, + download_type_default=download_type_default, + phase_default=phase_default, + skip_large_file_default=skip_large_file_default, + large_file_mb_default=large_file_mb_default, + keep_zip_files_default=keep_zip_files_default, + ) + st.markdown("
", unsafe_allow_html=True) + + selected_retest_job_id = st.session_state.get("recent_eval_jobs_retest_selected") + if selected_retest_job_id: + selected_retest_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_retest_job_id)), None) + if selected_retest_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Artifact retest · {selected_retest_job.get('title', '—')}", width="large") + def _recent_eval_retest_dialog() -> None: + _render_recent_evaluator_job_retest_dialog( + project_id, + environment, + selected_retest_job, + output_path_default=output_path_default, + phase_default=phase_default, + ) + + _recent_eval_retest_dialog() + finally: + if st.session_state.get("recent_eval_jobs_retest_selected") == str(selected_retest_job_id): + st.session_state.pop("recent_eval_jobs_retest_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Artifact retest · {selected_retest_job.get('title', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_retest_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_retest_selected", None) + st.rerun() + _render_recent_evaluator_job_retest_dialog( + project_id, + environment, + selected_retest_job, + output_path_default=output_path_default, + phase_default=phase_default, + ) + st.markdown("
", unsafe_allow_html=True) + + _render_job_list() diff --git a/evaluation_dashboard_app/lib/ui/styles_global.py b/evaluation_dashboard_app/lib/ui/styles_global.py index f4118e8..a66be10 100644 --- a/evaluation_dashboard_app/lib/ui/styles_global.py +++ b/evaluation_dashboard_app/lib/ui/styles_global.py @@ -49,21 +49,21 @@ def inject_app_page_styles() -> None: """, unsafe_allow_html=True, ) - try: - from lib.deploy_debug import running_in_docker + # try: + # from lib.deploy_debug import running_in_docker - if not running_in_docker(): - st.markdown( - """ - - """, - unsafe_allow_html=True, - ) - except Exception: - pass + # if not running_in_docker(): + # st.markdown( + # """ + # + # """, + # unsafe_allow_html=True, + # ) + # except Exception: + # pass diff --git a/evaluation_dashboard_app/lib/ui/task_history.py b/evaluation_dashboard_app/lib/ui/task_history.py new file mode 100644 index 0000000..e5d05f5 --- /dev/null +++ b/evaluation_dashboard_app/lib/ui/task_history.py @@ -0,0 +1,280 @@ +"""Shared task history/list rendering used across pages.""" + +from __future__ import annotations + +import json +from datetime import datetime, timedelta, timezone +from typing import Any, Dict, List, Optional + +import streamlit as st + +from lib.auth import get_current_user_id, is_auth_enabled +from lib.db import delete_task, get_task +from lib.ui.download_ui import TaskCardMode, render_task_list_empty_state, task_list_card_markup +from lib.ui.task_result_summary import render_task_result_summary + +_JST = timezone(timedelta(hours=9)) + + +def _to_jst(dt: Any) -> Optional[datetime]: + """Convert datetime to JST for display. 
Naive datetimes are assumed UTC.""" + if dt is None: + return None + if not hasattr(dt, "astimezone"): + return None + try: + if getattr(dt, "tzinfo", None) is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(_JST) + except Exception: + return None + + +def _task_type_label(task_type: str) -> str: + labels = { + "download_results": "Download results", + "download_scenarios": "Download scenarios", + "run_eval_dirs": "Run eval dirs", + "generate_summary_csv": "Generate summary CSV", + "build_parquet": "Build parquet", + "download_and_eval": "Download + Eval", + "run_evaluator_and_process": "Run Evaluator + Process", + "run_release_specsheet_workflow": "Release Specsheet", + } + return labels.get(task_type, task_type or "Task") + + +def _task_summary(t: Dict[str, Any]) -> str: + params = t.get("parameters") or {} + task_type = t.get("type", "") + if task_type == "download_results": + out = params.get("output_path") or params.get("job_id") or "" + return f"job_id={params.get('job_id', '')} → {out}" + if task_type == "download_scenarios": + out = params.get("output_dir") or params.get("output_path") or "" + return f"job_id={params.get('job_id', '')} → {out}" + if task_type in ("run_eval_dirs", "generate_summary_csv"): + return params.get("eval_root", "") + if task_type == "build_parquet": + return params.get("pkl_dir", "") + if task_type == "download_and_eval": + out = params.get("output_path") or params.get("job_id") or "" + parts = ["download"] + if params.get("run_eval"): + parts.append("eval") + if params.get("generate_parquet"): + parts.append("parquet") + return f"job_id={params.get('job_id', '')} [{'+'.join(parts)}] → {out}" + if task_type == "run_evaluator_and_process": + target = params.get("target_name", "") + target_type = "tag" if params.get("is_tag", False) else "branch" + return f"{target_type}={target} → {params.get('output_path', '')}" + if task_type == "run_release_specsheet_workflow": + target = params.get("target_name", "") + 
target_type = "tag" if params.get("is_tag", False) else "branch" + return f"{target_type}={target} → {params.get('output_path', '')}" + return "" + + +def _task_time_str(t: Dict[str, Any]) -> str: + created = t.get("created_at") + dt = _to_jst(created) if created else None + if not dt: + return "—" + try: + return dt.strftime("%b %d, %H:%M") + except Exception: + return str(created)[:16] if created else "—" + + +def _task_duration(t: Dict[str, Any]) -> Optional[str]: + created = t.get("created_at") + updated = t.get("updated_at") + if not created or not updated: + return None + try: + start = created.timestamp() if hasattr(created, "timestamp") else None + end = updated.timestamp() if hasattr(updated, "timestamp") else None + if start is None or end is None: + return None + secs = int(end - start) + if secs < 60: + return f"{secs}s" + if secs < 3600: + return f"{secs // 60}m {secs % 60}s" + return f"{secs // 3600}h {(secs % 3600) // 60}m" + except Exception: + return None + + +def render_task_detail_content(t: Dict[str, Any]) -> None: + """Render full task detail content.""" + try: + _render_task_detail_content_impl(t) + except Exception as e: + st.error(f"Could not load task details: {e}") + import traceback + st.code(traceback.format_exc(), language=None) + + +def _render_task_detail_content_impl(t: Dict[str, Any]) -> None: + status = t.get("status", "") + created_jst = _to_jst(t.get("created_at")) + updated_jst = _to_jst(t.get("updated_at")) + time_parts = [] + if created_jst: + try: + time_parts.append(f"Created: {created_jst.strftime('%Y-%m-%d %H:%M:%S')} JST") + except Exception: + time_parts.append(f"Created: {t.get('created_at')}") + if updated_jst and updated_jst != created_jst: + try: + time_parts.append(f"Updated: {updated_jst.strftime('%Y-%m-%d %H:%M:%S')} JST") + except Exception: + time_parts.append(f"Updated: {t.get('updated_at')}") + if time_parts: + st.caption(" · ".join(time_parts)) + + result_summary_raw = t.get("result_summary") + if 
result_summary_raw: + try: + result_summary = json.loads(result_summary_raw) if isinstance(result_summary_raw, str) else result_summary_raw + render_task_result_summary(result_summary) + st.markdown("---") + except (TypeError, ValueError): + pass + if t.get("result_path"): + st.text_input( + "Result path", + value=t["result_path"], + key=f"rp_modal_{str(t.get('id'))}", + disabled=True, + label_visibility="collapsed", + ) + if status == "failed" and t.get("error_message"): + st.error(t.get("error_message")) + progress_message = (t.get("progress_message") or "").strip() + if progress_message: + st.info(progress_message) + log_output = (t.get("log_output") or "").strip() + if log_output: + st.caption("Log output") + st.code(log_output, language=None) + params = t.get("parameters") or {} + if params: + st.caption("Parameters") + st.json(params) + + +def _open_task_detail(task_id: str) -> None: + st.session_state["_task_detail_id"] = str(task_id) + + +def _render_one_task_row( + t: Dict[str, Any], + current_user: Optional[str], + use_dialog: bool, + *, + mode: TaskCardMode, +) -> None: + task_id = t.get("id", "") + status = t.get("status", "") + status_labels = {"pending": "Pending", "running": "Running", "completed": "Completed", "failed": "Failed"} + status_label = status_labels.get(status, status) + summary = _task_summary(t) + sid = str(task_id) + summary_short = ( + (summary[:72] + "…") if mode == "history" and summary and len(summary) > 72 else (summary if mode == "history" else "—") + ) or "—" + progress_msg = (t.get("progress_message") or "").strip() + card = task_list_card_markup( + task_id=sid, + type_label=_task_type_label(t.get("type", "")), + status=status, + status_label=status_label, + time_str=_task_time_str(t), + duration=_task_duration(t) or "—", + summary_short=summary_short, + progress_pct=t.get("progress_pct"), + progress_message=progress_msg, + mode=mode, + ) + st.markdown(f'
{card}
', unsafe_allow_html=True) + + if use_dialog: + bv, bd, _sp = st.columns([1.15, 1.15, 4]) + with bv: + st.button("View", key=f"view_{sid}", on_click=_open_task_detail, args=(sid,)) + with bd: + stop_lbl = "Stop" if status in ("pending", "running") else "Remove" + stop_help = ( + "Cancels the Redis/RQ job when possible, then removes this row from the list." + if status in ("pending", "running") + else "Remove this row from the task list." + ) + if st.button(stop_lbl, key=f"del_{sid}", type="secondary", help=stop_help): + delete_task(sid, session_id=current_user) + st.rerun() + else: + bd, _sp = st.columns([1.15, 4]) + with bd: + stop_lbl = "Stop" if status in ("pending", "running") else "Remove" + stop_help = ( + "Cancels the Redis/RQ job when possible, then removes this row from the list." + if status in ("pending", "running") + else "Remove this row from the task list." + ) + if st.button(stop_lbl, key=f"del_{sid}", type="secondary", help=stop_help): + delete_task(sid, session_id=current_user) + st.rerun() + + if not use_dialog: + with st.expander("More", expanded=False): + render_task_detail_content(t) + + +def render_task_list(tasks: List[Dict[str, Any]], current_user: Optional[str]) -> bool: + """Render the shared active/history task list. 
Returns True if any active tasks exist.""" + if current_user: + st.caption(f"Logged in as **{current_user}** · your recent tasks only") + if not tasks: + render_task_list_empty_state() + return False + + active = [t for t in tasks if t.get("status") in ("pending", "running")] + history = [t for t in tasks if t.get("status") not in ("pending", "running")] + use_dialog = callable(getattr(st, "dialog", None)) + + for t in active: + _render_one_task_row(t, current_user, use_dialog, mode="active_compact") + + if history: + with st.expander(f"Task history ({len(history)})", expanded=False): + for t in history: + _render_one_task_row(t, current_user, use_dialog, mode="history") + + if use_dialog and st.session_state.get("_task_detail_id"): + task_id = st.session_state["_task_detail_id"] + try: + detail_task = get_task(task_id) + if detail_task: + + @st.dialog("Task details", width="large") + def _task_detail_modal(): + render_task_detail_content(detail_task) + if st.button("Close"): + st.session_state.pop("_task_detail_id", None) + st.rerun() + + _task_detail_modal() + except Exception as e: + st.error(f"Could not open task details: {e}") + finally: + st.session_state.pop("_task_detail_id", None) + + return len(active) > 0 + + +def get_task_list_current_user() -> Optional[str]: + """Return current user id when auth is enabled, else None.""" + return get_current_user_id() if is_auth_enabled() else None diff --git a/evaluation_dashboard_app/lib/ui/task_result_summary.py b/evaluation_dashboard_app/lib/ui/task_result_summary.py new file mode 100644 index 0000000..b7e0038 --- /dev/null +++ b/evaluation_dashboard_app/lib/ui/task_result_summary.py @@ -0,0 +1,222 @@ +"""Shared task result-summary renderers used by background task pages.""" + +from typing import Any, Dict, List, Optional + +import pandas as pd +import streamlit as st + + +def render_summary_table(rows: Optional[List[Dict[str, Any]]]) -> None: + """Render a summary table from rows (e.g. 
def _as_dict(value: Any) -> Dict[str, Any]:
    """Return *value* when it is a dict, else an empty dict (tolerates JSON null / wrong types)."""
    return value if isinstance(value, dict) else {}


def _as_list(value: Any) -> List[Any]:
    """Return *value* as a list; a lone string becomes a one-item list, anything else falsy/odd becomes []."""
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value:
        return [value]
    return []


def _count(data: Dict[str, Any], key: str) -> Any:
    """Return ``data[key]``, substituting 0 when the key is missing or stored as null."""
    value = data.get(key)
    return 0 if value is None else value


def render_summary_table(rows: Optional[List[Dict[str, Any]]]) -> None:
    """Render a summary table from rows (e.g. Scenario Name, Scenario ID, Status) when present.

    Rendering stays best-effort: a failure must not break the surrounding task
    view, but it is surfaced as a warning instead of being silently dropped.
    """
    if not rows:
        return
    try:
        df = pd.DataFrame(rows)
        st.subheader("Download Status")
        st.dataframe(df, width="stretch")
    except Exception as exc:  # UI boundary: report and keep rendering the rest of the page
        st.warning(f"Could not render the download status table: {exc}")


def _render_download_job(
    summary: Dict[str, Any],
    *,
    total_label: str,
    info_text: str,
    show_result_json_count: bool = False,
) -> None:
    """Render the shared layout of the ``download_results`` / ``download_scenarios`` summaries."""
    total = _count(summary, "total")
    success = _count(summary, "success")
    failed = _count(summary, "failed")
    out = summary.get("output_path", "")
    st.subheader("Summary")
    st.write(f"- {total_label}: **{total}**")
    st.write(f"- Successfully downloaded: **{success}**")
    if failed:
        st.write(f"- Failed: **{failed}**")
    if show_result_json_count:
        # NOTE(review): this reports ``total`` (not ``success``) as downloaded — confirm that is intended.
        st.write(f"- Result JSON files: **{total}** downloaded.")
    st.write(f"- Output directory: `{out}`")
    if success > 0:
        st.info(info_text)
    render_summary_table(summary.get("rows"))


def _render_download_results(summary: Dict[str, Any]) -> None:
    """Render the summary of a ``download_results`` job."""
    _render_download_job(
        summary,
        total_label="Total scenarios processed",
        info_text="To generate the final summary CSV files, go to the **Eval Results** tab and run the evaluation.",
    )


def _render_download_scenarios(summary: Dict[str, Any]) -> None:
    """Render the summary of a ``download_scenarios`` job."""
    _render_download_job(
        summary,
        total_label="Total scenarios",
        info_text="To generate summary CSV files, go to the **Eval Results** tab and run the evaluation.",
        show_result_json_count=True,
    )


def _write_generated_csv_line(summary: Dict[str, Any]) -> None:
    """Write the 'Generated Summary.csv / Score.csv' line shared by the eval summaries."""
    path = summary.get("summary_path", "")
    srows = _count(summary, "summary_rows")
    scrows = _count(summary, "score_rows")
    st.write(f"- Generated Summary.csv (**{srows}** rows) and Score.csv (**{scrows}** rows) in `{path}`")


def _render_run_eval_dirs(summary: Dict[str, Any]) -> None:
    """Render the summary of a ``run_eval_dirs`` job."""
    st.subheader("Eval Summary")
    st.write(f"- Directories processed: **{_count(summary, 'directories_processed')}**")
    _write_generated_csv_line(summary)


def _render_generate_summary_csv(summary: Dict[str, Any]) -> None:
    """Render the summary of a ``generate_summary_csv`` job."""
    st.subheader("Summary")
    _write_generated_csv_line(summary)


def _render_build_parquet(summary: Dict[str, Any]) -> None:
    """Render the summary of a ``build_parquet`` job."""
    path = summary.get("output_path", "")
    st.subheader("Summary")
    st.write(f"- Output: `{path}`")


def _write_eval_csv_rows(eval_summary_data: Dict[str, Any]) -> None:
    """Write the Summary.csv / Score.csv row counts of a nested eval summary."""
    st.write(
        f" - Summary.csv: **{_count(eval_summary_data, 'summary_rows')}** rows, "
        f"Score.csv: **{_count(eval_summary_data, 'score_rows')}** rows"
    )


def _render_download_and_eval(summary: Dict[str, Any]) -> None:
    """Render the summary of a combined ``download_and_eval`` job."""
    dl_summary = _as_dict(summary.get("download_summary"))
    eval_summary_data = _as_dict(summary.get("eval_summary"))
    parquet_path = summary.get("parquet_path", "")
    errors = _as_list(summary.get("errors"))

    st.subheader("Download + Eval + Parquet Summary")

    download_ok = bool(summary.get("download_success", False))
    if download_ok:
        st.write("✅ **Download: SUCCESS**")
        st.write(
            f" - Total: **{_count(dl_summary, 'total')}**, "
            f"Success: **{_count(dl_summary, 'success')}**, "
            f"Failed: **{_count(dl_summary, 'failed')}**"
        )
    else:
        st.write("❌ **Download: FAILED**")
        for err in errors:
            st.write(f" - {err}")

    if eval_summary_data:
        st.write("✅ **Eval: SUCCESS**")
        st.write(f" - Directories processed: **{_count(eval_summary_data, 'directories_processed')}**")
        _write_eval_csv_rows(eval_summary_data)

    if parquet_path:
        st.write(f"✅ **Parquet: SUCCESS** → `{parquet_path}`")

    # The failed-download branch already listed every error; repeating the same
    # list here would show each message twice.
    if errors and download_ok:
        st.error("Errors during execution:")
        for err in errors:
            st.write(f"- {err}")


def _render_run_evaluator_and_process(summary: Dict[str, Any]) -> None:
    """Render the summary of a ``run_evaluator_and_process`` job."""
    evaluator_job_id = summary.get("evaluator_job_id", "")
    evaluator_report_url = summary.get("evaluator_report_url", "")
    evaluator_status = summary.get("evaluator_status", "unknown")
    evaluator_build_status = summary.get("evaluator_build_status", "")
    evaluator_test_status = summary.get("evaluator_test_status", "")
    evaluator_fail_message = summary.get("evaluator_fail_message", "")
    evaluator_case_totals = _as_dict(summary.get("evaluator_case_totals"))
    evaluator_suites = summary.get("evaluator_suites") or []
    evaluator_failed_cases = summary.get("evaluator_failed_cases") or []
    dl_summary = _as_dict(summary.get("download_summary"))
    download_rows = summary.get("download_rows") or []
    eval_summary_data = _as_dict(summary.get("eval_summary"))
    parquet_path = summary.get("parquet_path", "")

    st.subheader("Run Evaluator + Download + Eval + Parquet Summary")

    st.write("🎯 **Evaluator**")
    st.write(f" - Job ID: `{evaluator_job_id}`")
    st.write(f" - Status: **{evaluator_status}**")
    if evaluator_build_status:
        st.write(f" - Build: **{evaluator_build_status}**")
    if evaluator_test_status:
        st.write(f" - Test: **{evaluator_test_status}**")
    if evaluator_case_totals:
        st.write(
            " - Case results: "
            f"**{_count(evaluator_case_totals, 'success')}** success, "
            f"**{_count(evaluator_case_totals, 'failed')}** failed, "
            f"**{_count(evaluator_case_totals, 'canceled')}** canceled "
            f"(total **{_count(evaluator_case_totals, 'total')}**)"
        )
    if evaluator_fail_message:
        st.write(f" - Message: `{evaluator_fail_message}`")
    if evaluator_report_url:
        st.markdown(f" - Report: [Open]({evaluator_report_url})")
    if evaluator_suites:
        st.caption("Evaluator suite summary")
        st.dataframe(pd.DataFrame(evaluator_suites), width="stretch", hide_index=True)
    if evaluator_failed_cases:
        st.caption("Failed cases from evaluator")
        st.dataframe(pd.DataFrame(evaluator_failed_cases), width="stretch", hide_index=True)

    st.write("📥 **Download**")
    st.write(
        f" - Total: **{_count(dl_summary, 'total')}**, "
        f"Success: **{_count(dl_summary, 'success')}**, "
        f"Failed: **{_count(dl_summary, 'failed')}**"
    )
    if download_rows:
        render_summary_table(download_rows)

    if eval_summary_data:
        st.write("🧮 **Evaluation**")
        st.write(f" - Directories processed: **{_count(eval_summary_data, 'directories_processed')}**")
        st.write(
            f" - Success: **{_count(eval_summary_data, 'success')}**, "
            f"Failed: **{_count(eval_summary_data, 'failed')}**"
        )
        _write_eval_csv_rows(eval_summary_data)

    if parquet_path:
        st.write("📦 **Parquet**")
        st.write(f" - Output: `{parquet_path}`")

    if evaluator_report_url:
        st.markdown(f"### [📊 View Evaluator Report]({evaluator_report_url})")


def _render_release_specsheet(summary: Dict[str, Any]) -> None:
    """Render the summary of a ``run_release_specsheet_workflow`` job."""
    st.subheader("Release Specsheet Summary")
    st.write(f"📁 **Release root:** `{summary.get('release_root', '')}`")
    st.write(f"🏷️ **Version:** `{summary.get('version', '')}`")

    evaluator_jobs = _as_dict(summary.get("evaluator_jobs"))
    if evaluator_jobs:
        job_rows = []
        for role, raw_payload in evaluator_jobs.items():
            payload = _as_dict(raw_payload)  # a null / malformed entry still gets a row
            job_rows.append(
                {
                    "role": role,
                    "job_id": payload.get("job_id", ""),
                    "status": payload.get("status", ""),
                    "catalog_id": payload.get("catalog_id", ""),
                    "suite_count": payload.get("suite_count", ""),
                    "description": payload.get("description", ""),
                    "report_url": payload.get("report_url", ""),
                }
            )
        st.dataframe(pd.DataFrame(job_rows), width="stretch", hide_index=True)

    analysis_artifacts = _as_dict(summary.get("analysis_artifacts"))
    if analysis_artifacts:
        st.write("🔎 **Detailed analysis artifacts:**")
        artifact_rows = []
        for role, raw_payload in analysis_artifacts.items():
            payload = _as_dict(raw_payload)
            download = _as_dict(payload.get("download"))
            eval_summary = _as_dict(payload.get("eval"))
            raw_warnings = payload.get("warnings")
            warnings = raw_warnings if isinstance(raw_warnings, list) else []
            artifact_rows.append(
                {
                    "role": role,
                    "path": payload.get("path", ""),
                    "download_success": download.get("success", ""),
                    "download_total": download.get("total", ""),
                    "summary_rows": eval_summary.get("summary_rows", ""),
                    "score_rows": eval_summary.get("score_rows", ""),
                    "parquet_path": payload.get("parquet_path", ""),
                    # Only the first three warnings are shown to keep the cell readable.
                    "warnings": "; ".join(str(item) for item in warnings[:3]),
                }
            )
        st.dataframe(pd.DataFrame(artifact_rows), width="stretch", hide_index=True)

    specsheet_pdf = summary.get("specsheet_pdf", "")
    if specsheet_pdf:
        st.write(f"✅ **Specsheet PDF:** `{specsheet_pdf}`")


# Maps the ``job`` field of a result summary to the function that renders it.
_JOB_RENDERERS: Dict[str, Callable[[Dict[str, Any]], None]] = {
    "download_results": _render_download_results,
    "download_scenarios": _render_download_scenarios,
    "run_eval_dirs": _render_run_eval_dirs,
    "generate_summary_csv": _render_generate_summary_csv,
    "build_parquet": _render_build_parquet,
    "download_and_eval": _render_download_and_eval,
    "run_evaluator_and_process": _render_run_evaluator_and_process,
    "run_release_specsheet_workflow": _render_release_specsheet,
}


def render_task_result_summary(summary: Dict[str, Any]) -> None:
    """Render a result summary block from task result_summary JSON.

    The ``job`` field selects a job-specific layout; a summary with an unknown
    or missing job is shown as raw JSON.
    """
    job = summary.get("job", "")
    renderer = _JOB_RENDERERS.get(job) if isinstance(job, str) else None
    if renderer is None:
        st.json(summary)
        return
    renderer(summary)
- - -""" - components.html(html, height=480, scrolling=True) +README_FILES = { + "Japanese": Path("Readme.md"), + "English": Path("Readme.en.md"), +} def _render_markdown_with_images(chunk: str) -> None: @@ -69,11 +47,22 @@ def _render_markdown_with_images(chunk: str) -> None: break -readme_path = Path("Readme.md") -content = readme_path.read_text(encoding="utf-8") +language = st.radio( + "README language", + options=list(README_FILES.keys()), + horizontal=True, + label_visibility="collapsed", +) + +selected_readme_path = README_FILES[language] +if not selected_readme_path.exists(): + st.error(f"README file not found: {selected_readme_path}") + st.stop() + +content = selected_readme_path.read_text(encoding="utf-8") for idx, piece in enumerate(MERMAID_FENCE.split(content)): if idx % 2 == 0: _render_markdown_with_images(piece) else: - _render_mermaid(piece) + render_mermaid(piece) diff --git a/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py new file mode 100644 index 0000000..7297d87 --- /dev/null +++ b/evaluation_dashboard_app/pages/11_T4_Dataset_Server.py @@ -0,0 +1,447 @@ +""" +Exercise the T4 visualizer HTTP API (``t4-server``): ``GET /health``, ``GET /datasets``, +``GET /datasets/{t4dataset_id}/scenarios``, and ``POST /render``. +Build embeddable JSON / query strings for T4 dataset context and render payloads. 
+""" +from __future__ import annotations + +import json +import os +from typing import Any, List, Optional + +import pandas as pd +import streamlit as st + +from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header +from lib.t4_dataset_embed import ( + build_render_request_embed, + t4_dataset_context, + t4_share_query_params, + target_objects_from_rows, +) +from lib.t4_visualizer_client import ( + DEFAULT_BASE_URL, + ENV_BASE_URL, + RenderRequest, + T4VisualizerClient, + T4VisualizerError, + TargetObjectIn, + render_request_to_json_body, + render_response_json_for_debug, + target_object_from_gt_row, +) + +st.set_page_config( + page_title="T4 dataset server", + page_icon="📡", + layout="wide", + initial_sidebar_state="expanded", +) +inject_app_page_styles() + +render_page_hero( + kicker="Integration", + title="T4 dataset server & embed helpers", + description=( + "Call the Tier4 visualizer HTTP service (same client as Bounding Box Viewer): health, dataset list, " + "scenarios per dataset (names and frame counts), camera render. Fetch lists, pick ids from the server " + "or type your own, then render or copy embed JSON." 
+ ), + mode="Single Run", +) + +if "t4_test_base_url" not in st.session_state: + st.session_state["t4_test_base_url"] = os.environ.get(ENV_BASE_URL, DEFAULT_BASE_URL).rstrip("/") + +# Cached API results for pickers +if "t4_dataset_ids" not in st.session_state: + st.session_state["t4_dataset_ids"] = [] +if "t4_last_datasets_payload" not in st.session_state: + st.session_state["t4_last_datasets_payload"] = None +if "t4_scenario_rows" not in st.session_state: + st.session_state["t4_scenario_rows"] = [] +if "t4_last_scenarios_payload" not in st.session_state: + st.session_state["t4_last_scenarios_payload"] = None + + +def _hydrate_t4_from_url() -> None: + """Fill context + render/embed widgets from ``?render_json=…`` (same JSON as curl ``-d``).""" + qp = st.query_params + raw = qp.get("render_json") + if raw is None: + return + if isinstance(raw, list): + raw = raw[0] if raw else None + if not raw: + return + sig = f"render_json:{raw}" + if st.session_state.get("_t4_hydrate_sig") == sig: + return + try: + body = json.loads(str(raw)) + except json.JSONDecodeError: + return + if not isinstance(body, dict): + return + st.session_state["t4_ctx_ds"] = str(body.get("t4dataset_id", "")) + st.session_state["t4_ctx_scen"] = str(body.get("scenario_name", "")) + try: + st.session_state["t4_ctx_frame"] = int(body.get("frame_index", 0)) + except (TypeError, ValueError): + st.session_state["t4_ctx_frame"] = 0 + ver = body.get("version") + st.session_state["t4_ctx_ver"] = "" if ver is None else str(ver) + to = body.get("target_objects") + if isinstance(to, list): + tgt = json.dumps(to, ensure_ascii=False, indent=2) + st.session_state["t4_emb_rows"] = tgt + st.session_state["t4_render_targets"] = tgt + st.session_state["t4_render_use_tgt"] = len(to) > 0 + else: + st.session_state["t4_emb_rows"] = "[]" + st.session_state["t4_render_targets"] = "[]" + st.session_state["t4_render_use_tgt"] = False + st.session_state["t4_render_crop"] = bool(body.get("crop_cameras", False)) + 
st.session_state["t4_render_ann"] = bool(body.get("show_annotations", True)) + st.session_state["_t4_hydrate_sig"] = sig + + +_hydrate_t4_from_url() + +base_url = st.sidebar.text_input( + "Server base URL", + key="t4_test_base_url", + help=f"Override env {ENV_BASE_URL} for this session.", +) +timeout_s = st.sidebar.number_input("HTTP timeout (s)", min_value=5.0, max_value=600.0, value=120.0, step=5.0) + + +def _client() -> T4VisualizerClient: + return T4VisualizerClient(base_url=(base_url or "").strip() or DEFAULT_BASE_URL, timeout=float(timeout_s)) + + +def _bash_single_quoted(s: str) -> str: + """Wrap *s* for safe use as a bash single-quoted string (e.g. ``-d '…'``).""" + return "'" + s.replace("'", "'\"'\"'") + "'" + + +def _on_dataset_pick() -> None: + sel = st.session_state.get("t4_pick_ds", "—") + if sel != "—": + st.session_state["t4_ctx_ds"] = sel + + +def _on_scenario_pick() -> None: + sel = st.session_state.get("t4_pick_scen", "—") + if sel != "—": + st.session_state["t4_ctx_scen"] = sel + + +# --- Shared context (dataset, version, scenario, frame) --------------------------------- +section_header( + "Context", + "Fetch lists from the server, then choose **t4dataset_id** and **scenario_name** from the dropdowns " + "or type any value in the text fields.", +) + +row_fetch = st.columns([1, 1, 2]) +with row_fetch[0]: + if st.button("GET /datasets", type="primary", key="t4_btn_datasets"): + try: + d = _client().list_datasets() + st.session_state["t4_last_datasets_payload"] = d + ds = d.get("datasets") + st.session_state["t4_dataset_ids"] = [str(x) for x in ds] if isinstance(ds, list) else [] + st.session_state["t4_scenario_rows"] = [] + st.session_state["t4_last_scenarios_payload"] = None + st.success(f"OK — {len(st.session_state['t4_dataset_ids'])} dataset id(s).") + except T4VisualizerError as ex: + st.error(f"{ex} (status={ex.status_code})") + if ex.response_text: + st.code(ex.response_text[:4000], language="text") + except OSError as ex: + 
st.error(f"Network error: {ex}") + +with row_fetch[1]: + if st.button("GET /datasets/…/scenarios", type="primary", key="t4_btn_scenarios"): + _tid = (st.session_state.get("t4_ctx_ds") or "").strip() + if not _tid: + st.warning("Set **t4dataset_id** first.") + else: + try: + _ver = (st.session_state.get("t4_ctx_ver") or "").strip() or None + out = _client().list_dataset_scenarios(_tid, version=_ver) + st.session_state["t4_last_scenarios_payload"] = out + rows = out.get("scenarios") + st.session_state["t4_scenario_rows"] = rows if isinstance(rows, list) else [] + st.success(f"OK — {len(st.session_state['t4_scenario_rows'])} scenario(s).") + except T4VisualizerError as ex: + st.error(f"{ex} (status={ex.status_code})") + if ex.response_text: + st.code(ex.response_text[:4000], language="text") + except OSError as ex: + st.error(f"Network error: {ex}") + +with row_fetch[2]: + if st.session_state.get("t4_last_datasets_payload") is not None: + with st.expander("Last GET /datasets JSON", expanded=False): + st.json(st.session_state["t4_last_datasets_payload"]) + if st.session_state.get("t4_last_scenarios_payload") is not None: + with st.expander("Last GET /datasets/…/scenarios JSON", expanded=False): + st.json(st.session_state["t4_last_scenarios_payload"]) + +_ids = st.session_state["t4_dataset_ids"] +_ds_options = ["—"] + sorted(_ids) +_name_rows = st.session_state["t4_scenario_rows"] +_scen_names: List[str] = [] +for r in _name_rows: + if isinstance(r, dict) and r.get("name") is not None: + _scen_names.append(str(r["name"])) +_scen_options = ["—"] + sorted(set(_scen_names)) + +c1, c2, c3, c4 = st.columns(4) +with c1: + st.selectbox( + "Pick dataset (from last /datasets)", + options=_ds_options, + key="t4_pick_ds", + on_change=_on_dataset_pick, + help="Choose a server-reported id, or leave as — and type below.", + ) + st.text_input( + "t4dataset_id", + key="t4_ctx_ds", + placeholder="uuid or folder id", + ) +with c2: + st.text_input( + "version (optional)", + 
key="t4_ctx_ver", + help="Annotation dir version; passed to scenarios and render when non-empty.", + ) +with c3: + st.selectbox( + "Pick scenario (from last /scenarios)", + options=_scen_options, + key="t4_pick_scen", + on_change=_on_scenario_pick, + help="Choose **name** from the server, or type any scenario below.", + ) + st.text_input( + "scenario_name", + key="t4_ctx_scen", + placeholder="scene name for POST /render", + ) +with c4: + st.number_input("frame_index", min_value=0, value=0, step=1, key="t4_ctx_frame") + +if _name_rows: + st.caption( + "Valid **frame_index** for each scene is **0 … nbr_samples − 1** (see table). " + "Use **Render & embed** to request PNGs." + ) + st.dataframe(pd.DataFrame(_name_rows), width='stretch', hide_index=True) + +st.divider() + +tab_overview, tab_render = st.tabs(["Overview", "Render & embed JSON"]) + +with tab_overview: + section_header("/health", "GET — server liveness.") + if st.button("GET /health", type="primary", key="t4_btn_health"): + try: + h = _client().health() + st.success("OK") + st.json(h) + except T4VisualizerError as ex: + st.error(f"{ex} (status={ex.status_code})") + if ex.response_text: + st.code(ex.response_text[:4000], language="text") + except OSError as ex: + st.error(f"Network error: {ex}") + +with tab_render: + section_header("POST /render", "Request camera PNGs; optional ``target_objects`` from JSON below.") + ds_id = (st.session_state.get("t4_ctx_ds") or "").strip() + scen = (st.session_state.get("t4_ctx_scen") or "").strip() + frame = int(st.session_state.get("t4_ctx_frame") or 0) + ver_raw = (st.session_state.get("t4_ctx_ver") or "").strip() + version_opt: Optional[str] = ver_raw if ver_raw else None + + st.caption( + f"Using context: **t4dataset_id**=`{ds_id or '…'}` · **scenario_name**=`{scen or '…'}` · " + f"**frame_index**={frame}" + + (f" · **version**=`{version_opt}`" if version_opt else "") + ) + + tgt_json = st.text_area( + "target_objects (JSON array, optional)", + value="[]", + 
height=140, + key="t4_render_targets", + help="List of objects with uuid/x/y/z/label/width/length/height/yaw (matches GT row shape).", + ) + o1, o2, o3 = st.columns(3) + with o1: + crop = st.checkbox("crop_cameras", value=False, key="t4_render_crop") + with o2: + show_ann = st.checkbox("show_annotations", value=True, key="t4_render_ann") + with o3: + overlay_gt = st.checkbox("Use target_objects in request", value=True, key="t4_render_use_tgt") + + req: Optional[RenderRequest] = None + parse_err: Optional[str] = None + if overlay_gt: + try: + raw = json.loads(tgt_json or "[]") + if not isinstance(raw, list): + parse_err = "target_objects JSON must be an array" + else: + objs: List[TargetObjectIn] = [] + for item in raw: + if not isinstance(item, dict): + parse_err = "each target must be an object" + break + d = target_object_from_gt_row(item) + objs.append(TargetObjectIn(**d)) + if parse_err is None: + req = RenderRequest( + t4dataset_id=ds_id, + scenario_name=scen, + frame_index=frame, + target_objects=objs, + crop_cameras=crop, + show_annotations=show_ann, + version=version_opt, + ) + except json.JSONDecodeError as ex: + parse_err = f"Invalid JSON: {ex}" + else: + req = RenderRequest( + t4dataset_id=ds_id, + scenario_name=scen, + frame_index=frame, + target_objects=[], + crop_cameras=crop, + show_annotations=show_ann, + version=version_opt, + ) + + if parse_err: + st.warning(parse_err) + + col_go, col_prev = st.columns([1, 2]) + with col_go: + do_render = st.button("POST /render", type="primary", key="t4_btn_render", disabled=req is None) + with col_prev: + if req is not None: + with st.expander("Request body preview", expanded=False): + st.json(render_request_to_json_body(req)) + + if do_render and req is not None: + try: + with st.spinner("Rendering…"): + res = _client().render(req) + imgs = res.decode_all_images() + cap_parts = [ + f"sample_token={res.sample_token!r}", + f"timestamp_us={res.timestamp_us}", + ] + if res.elapsed_ms is not None: + 
cap_parts.append(f"elapsed_ms={res.elapsed_ms}") + if res.tier4_load_ms is not None: + cap_parts.append(f"tier4_load_ms={res.tier4_load_ms}") + if res.render_ms is not None: + cap_parts.append(f"render_ms={res.render_ms}") + st.caption(" · ".join(cap_parts)) + if res.raw_json is not None: + with st.expander("Response JSON (debug)", expanded=False): + st.json(render_response_json_for_debug(res.raw_json)) + if not imgs: + st.info("No images in response.") + else: + n = min(len(imgs), 6) + cols = st.columns(n) + for i in range(n): + label, png = imgs[i] + cols[i].image(png, caption=label, width='stretch') + if len(imgs) > n: + st.caption(f"Showing first {n} of {len(imgs)} images.") + except T4VisualizerError as ex: + st.error(f"{ex} (status={ex.status_code})") + if ex.response_text: + st.code(ex.response_text[:4000], language="text") + except OSError as ex: + st.error(f"Network error: {ex}") + + st.divider() + section_header( + "Embed helpers", + "Same **context** fields as above. Copy structured context, query strings, and full ``POST /render`` JSON.", + ) + + emb_ds = (st.session_state.get("t4_ctx_ds") or "").strip() + emb_scen = (st.session_state.get("t4_ctx_scen") or "").strip() + emb_frame = int(st.session_state.get("t4_ctx_frame") or 0) + + emb_ta = st.text_area( + "Optional GT rows as JSON array (for target_objects_from_rows)", + value="[]", + height=120, + key="t4_emb_rows", + ) + + rows_err: Optional[str] = None + rows_list: List[dict[str, Any]] = [] + try: + parsed = json.loads(emb_ta or "[]") + if not isinstance(parsed, list): + rows_err = "Must be a JSON array" + else: + for i, row in enumerate(parsed): + if not isinstance(row, dict): + rows_err = f"Item {i} is not an object" + break + if rows_err is None: + rows_list = [r for r in parsed if isinstance(r, dict)] + except json.JSONDecodeError as ex: + rows_err = str(ex) + + if rows_err: + st.warning(rows_err) + + ctx = t4_dataset_context(emb_ds, emb_scen, frame_index=emb_frame) + emb_ver = 
(st.session_state.get("t4_ctx_ver") or "").strip() + full = build_render_request_embed( + emb_ds, + emb_scen, + emb_frame, + target_rows=rows_list if rows_list else None, + show_annotations=bool(st.session_state.get("t4_render_ann", True)), + crop_cameras=bool(st.session_state.get("t4_render_crop", False)), + version=emb_ver if emb_ver else None, + ) + viz_base = (base_url or "").strip().rstrip("/") or DEFAULT_BASE_URL + q = t4_share_query_params(emb_ds, emb_scen, frame_index=emb_frame) + render_get_url = f"{viz_base}/render?{q}" + + st.subheader("Render GET URL") + st.caption( + "GET-style URL on the **visualizer server** (same **Server base URL** as API calls). " + "Requires **GET /render** with ``t4dataset_id``, ``scenario_name``, ``frame_index``; otherwise use **curl** (POST JSON) below." + ) + st.markdown(f"[{render_get_url}]({render_get_url})") + + if rows_list: + st.subheader("target_objects_from_rows (preview)") + st.json(target_objects_from_rows(rows_list)) + + curl_base = (base_url or "").strip() or DEFAULT_BASE_URL + body_pretty = json.dumps(full["post_render_json"], indent=2, ensure_ascii=False) + curl_lines = ( + f"curl -sS {curl_base}/render \\\n" + f" -H 'Content-Type: application/json' \\\n" + f" -d {_bash_single_quoted(body_pretty)}" + ) + st.subheader("curl") + st.code(curl_lines, language="bash") diff --git a/evaluation_dashboard_app/pages/12_Prediction_Evaluation.py b/evaluation_dashboard_app/pages/12_Prediction_Evaluation.py new file mode 100644 index 0000000..515990c --- /dev/null +++ b/evaluation_dashboard_app/pages/12_Prediction_Evaluation.py @@ -0,0 +1,1038 @@ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Callable + +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import pyarrow.parquet as pq +import streamlit as st + +from lib.overview_url_hydrate import try_hydrate_session_from_overview_query_params +from lib.page_chrome import 
inject_app_page_styles, render_loaded_data_section, render_page_hero, section_header +from lib.path_utils import get_run_display_name, list_run_directories, path_display +from lib.prediction_eval import build_specsheet_aligned_prediction_artifacts + + +st.set_page_config( + layout="wide", + page_title="Prediction Evaluation", + page_icon="🧭", + initial_sidebar_state="expanded", +) +inject_app_page_styles() +st.markdown( + """ + + """, + unsafe_allow_html=True, +) + +PLOTLY_COLORS = { + "ink": "#12344d", + "teal": "#0f766e", + "blue": "#1d4ed8", + "amber": "#c27803", + "rose": "#be123c", + "slate": "#475569", +} +DEFAULT_TOPIC = "perception.object_recognition.objects" +CHECKPOINTS = (1.0, 3.0, 5.0) +METRIC_ORDER = [ + "minADE@1s", + "minADE@3s", + "minADE@5s", + "minFDE@1s", + "minFDE@3s", + "minFDE@5s", +] +APP_CACHE_ROOT = ".dashboard_cache" +ARTIFACT_DIRNAME = "prediction_eval_cache" +PREDICTION_CACHE_VERSION = 4 +ARTIFACT_TABLES = ["label_summary", "distance_summary", "polar_summary"] +R_MAX, R_STEP, R_INI = 200, 20, 0 +THETA_STEP, THETA_INI = 60, -60 +THETA_MAX = THETA_INI + 360 +R_LABELS = [f"{i}-{i + R_STEP}" for i in range(R_INI, R_MAX, R_STEP)] +R_EDGES = np.arange(R_INI, R_MAX + R_STEP, R_STEP) +THETA_LABELS = [f"{i}-{i + THETA_STEP}" for i in range(THETA_INI, THETA_MAX, THETA_STEP)] +THETA_EDGES_DEG = np.arange(THETA_INI, THETA_MAX + THETA_STEP, THETA_STEP) +DISTANCE_BIN_ORDER = [ + "0-20 m", + "20-40 m", + "40-60 m", + "60-80 m", + "80-100 m", + "100-120 m", + "120-140 m", + "140-160 m", + "160-180 m", + "180-200 m", + "200+ m", +] + + +def render_stat_card(kicker: str, value: str, note: str) -> None: + st.markdown( + f""" +
+
{kicker}
+
{value}
+
{note}
+
def ordered_distance_bins(
    values: list[str] | pd.Index,
    canonical_order: list[str] | None = None,
) -> list[str]:
    """Return the distinct bin labels in *values*, sorted in canonical display order.

    Args:
        values: Bin labels to order; null entries are ignored and the rest are
            compared by their string form.
        canonical_order: Preferred label order. Defaults to the module's
            ``DISTANCE_BIN_ORDER`` followed by ``R_LABELS``.

    Returns:
        Labels found in *canonical_order* first (in that order, each once),
        then any remaining labels sorted by length and then alphabetically.
    """
    if canonical_order is None:
        canonical_order = DISTANCE_BIN_ORDER + R_LABELS
    present = {str(v) for v in values if pd.notna(v)}
    # dict.fromkeys keeps first-seen order while dropping repeated labels.
    ordered = [label for label in dict.fromkeys(canonical_order) if label in present]
    leftovers = sorted(present.difference(ordered), key=lambda v: (len(v), v))
    return ordered + leftovers
distance=%{customdata[1]}
value=%{customdata[2]:.3f} m", + showlegend=False, + ) + ) + + fig.update_layout( + title=metric_name, + height=430, + margin=dict(l=10, r=10, t=55, b=10), + polar=dict( + radialaxis=dict( + tickmode="array", + tickvals=list(range(len(ring_order))), + ticktext=ring_order, + angle=90, + gridcolor="rgba(148,163,184,0.25)", + ), + angularaxis=dict( + tickmode="array", + tickvals=theta_centers, + ticktext=label_order, + rotation=90, + direction="clockwise", + gridcolor="rgba(148,163,184,0.20)", + ), + bgcolor="rgba(248,250,252,0.75)", + ), + ) + return fig + + +def build_theta_ring_figure(label_polar: pd.DataFrame, metric_name: str, label_name: str, value_col: str, *, delta_mode: bool) -> go.Figure: + theta_order = THETA_LABELS + radial_order = [r for r in R_LABELS if r in set(label_polar["r"].astype(str))] + pivot = ( + label_polar.pivot(index="r", columns="theta", values=value_col) + .reindex(index=radial_order, columns=theta_order) + ) + theta_width = 360 / max(len(theta_order), 1) + theta_centers = [i * theta_width for i in range(len(theta_order))] + + values = pivot.values.astype(float) if pivot.size else np.array([[0.0]]) + finite = values[np.isfinite(values)] + if finite.size == 0: + zmin, zmax = (-1.0, 1.0) if delta_mode else (0.0, 1.0) + else: + if delta_mode: + bound = float(np.nanmax(np.abs(finite))) or 1.0 + zmin, zmax = -bound, bound + else: + zmin, zmax = float(np.nanmin(finite)), float(np.nanmax(finite)) + if zmin == zmax: + zmax = zmin + 1.0 + + fig = go.Figure() + for ring_idx, ring_name in enumerate(radial_order): + vals = pivot.loc[ring_name].tolist() + fig.add_trace( + go.Barpolar( + r=[1.0] * len(theta_order), + base=[ring_idx] * len(theta_order), + theta=theta_centers, + width=[theta_width * 0.92] * len(theta_order), + marker=dict( + color=vals, + colorscale="RdBu" if delta_mode else "YlOrRd", + cmin=zmin, + cmax=zmax, + line=dict(color="rgba(255,255,255,0.32)", width=1), + colorbar=dict(title="m") if ring_idx == len(radial_order) - 1 else None, + 
), + customdata=np.array([[theta_order[i], ring_name, vals[i]] for i in range(len(theta_order))], dtype=object), + hovertemplate=("theta=%{customdata[0]}
distance=%{customdata[1]}
Δ=%{customdata[2]:+.3f} m" if delta_mode else "theta=%{customdata[0]}
distance=%{customdata[1]}
value=%{customdata[2]:.3f} m"), + showlegend=False, + ) + ) + + fig.update_layout( + title=f"{label_name}{' (B - A)' if delta_mode else ''}", + height=320, + margin=dict(l=10, r=10, t=45, b=10), + polar=dict( + radialaxis=dict( + tickmode="array", + tickvals=list(range(len(radial_order))), + ticktext=radial_order, + angle=90, + gridcolor="rgba(148,163,184,0.22)", + ), + angularaxis=dict( + tickmode="array", + tickvals=theta_centers, + ticktext=theta_order, + rotation=90, + direction="clockwise", + gridcolor="rgba(148,163,184,0.18)", + ), + bgcolor="rgba(248,250,252,0.75)", + ), + ) + return fig + + +def render_compare_stat_card(kicker: str, a_value: float | None, b_value: float | None, note: str) -> None: + delta = None + if a_value is not None and b_value is not None and pd.notna(a_value) and pd.notna(b_value): + delta = float(b_value) - float(a_value) + delta_text = f"Δ {delta:+.2f} m" if delta is not None else "Δ n/a" + st.markdown( + f""" +
+
{kicker}
+
A {a_value:.2f} / B {b_value:.2f}
+
{delta_text}
{note}
+
def _prediction_source_path(run_path: Path) -> Path | None:
    """Return the prediction source file inside *run_path*, or None when there is none.

    ``future.csv`` is checked before ``future.parquet``.
    NOTE(review): CSV wins when both files exist — confirm that precedence is intended.
    """
    for filename in ("future.csv", "future.parquet"):
        candidate = run_path / filename
        if candidate.exists():
            return candidate
    return None


def _run_has_prediction_source(run_path: Path) -> bool:
    """Return True when *run_path* holds a ``future.parquet`` or ``future.csv`` file."""
    return _prediction_source_path(run_path) is not None


@st.cache_data(show_spinner=False)
def load_prediction_metadata(run_path_str: str) -> dict[str, float | int | str]:
    """Return row count, row-group count, size in MB and source kind of the prediction file.

    Row and row-group counts come from the Parquet footer; for a CSV source (or
    a missing file) they are reported as 0.
    """
    future_path = _prediction_source_path(Path(run_path_str))
    if future_path is None:
        return {"row_count": 0, "row_groups": 0, "file_size_mb": 0.0, "source_kind": "missing"}
    file_size_mb = future_path.stat().st_size / (1024 * 1024)
    if future_path.suffix == ".parquet":
        parquet_meta = pq.ParquetFile(future_path).metadata
        return {
            "row_count": int(parquet_meta.num_rows),
            "row_groups": int(parquet_meta.num_row_groups),
            "file_size_mb": file_size_mb,
            "source_kind": "parquet",
        }
    return {
        "row_count": 0,
        "row_groups": 0,
        "file_size_mb": file_size_mb,
        "source_kind": "csv",
    }


def get_prediction_cache_dir(run_path: Path) -> Path:
    """Return the directory that stores cached prediction summaries for *run_path*."""
    return run_path / APP_CACHE_ROOT / ARTIFACT_DIRNAME


def get_prediction_manifest_path(run_path: Path) -> Path:
    """Return the path of the cache manifest (``manifest.json``) for *run_path*."""
    return get_prediction_cache_dir(run_path) / "manifest.json"


def get_prediction_table_path(run_path: Path, table_name: str) -> Path:
    """Return the Parquet path of the cached summary table *table_name*."""
    return get_prediction_cache_dir(run_path) / f"{table_name}.parquet"


def load_prediction_artifact_manifest(run_path: Path) -> dict[str, object] | None:
    """Return the parsed cache manifest, or None when it is missing, unreadable or not an object."""
    manifest_path = get_prediction_manifest_path(run_path)
    try:
        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Missing file, I/O failure, bad encoding or invalid JSON all mean "no usable cache".
        return None
    # Callers use ``.get``; any other JSON value (list, string, ...) is treated as corrupt.
    return manifest if isinstance(manifest, dict) else None


def prediction_artifacts_ready(run_path: Path) -> bool:
    """Return True when the cached summaries exist and match the current source file.

    The cache is valid only if the manifest carries the current
    ``PREDICTION_CACHE_VERSION``, its recorded modification time equals that of
    the source file, and every table in ``ARTIFACT_TABLES`` is on disk.
    """
    manifest = load_prediction_artifact_manifest(run_path)
    future_path = _prediction_source_path(run_path)
    if manifest is None or future_path is None:
        return False
    if manifest.get("cache_version") != PREDICTION_CACHE_VERSION:
        return False
    try:
        source_mtime_ns = future_path.stat().st_mtime_ns
    except OSError:
        # The source disappeared between lookup and stat: nothing to match against.
        return False
    if manifest.get("future_mtime_ns") != source_mtime_ns:
        return False
    return all(get_prediction_table_path(run_path, name).exists() for name in ARTIFACT_TABLES)


def _noop_progress(_: float, __: str) -> None:
    """Progress callback used when the caller supplies none; does nothing."""
    return None


def save_prediction_artifacts(
    run_path: Path,
    artifacts: dict[str, pd.DataFrame],
    progress_callback: Callable[[float, str], None] | None = None,
) -> None:
    """Write the summary tables and the manifest that marks the cache as valid.

    Args:
        run_path: Run directory that owns the cache.
        artifacts: Mapping that contains a DataFrame for every name in ``ARTIFACT_TABLES``.
        progress_callback: Optional ``(fraction, message)`` reporter.

    Raises:
        FileNotFoundError: If the run has no prediction source file, since the
            manifest records that file's modification time.
    """
    report = progress_callback or _noop_progress
    source_path = _prediction_source_path(run_path)
    if source_path is None:
        # Fail before writing anything instead of crashing on ``None.stat()`` afterwards.
        raise FileNotFoundError(f"No future.parquet or future.csv found in {run_path}")
    cache_dir = get_prediction_cache_dir(run_path)
    cache_dir.mkdir(parents=True, exist_ok=True)
    total_tables = max(len(ARTIFACT_TABLES), 1)
    for idx, name in enumerate(ARTIFACT_TABLES, start=1):
        report(0.88 + (0.09 * idx / total_tables), f"Saving `{name}` summary...")
        artifacts[name].to_parquet(get_prediction_table_path(run_path, name), index=False)
    # The manifest is written last so a partially written cache is never reported as ready.
    manifest = {
        "cache_version": PREDICTION_CACHE_VERSION,
        "future_mtime_ns": source_path.stat().st_mtime_ns,
        "table_names": ARTIFACT_TABLES,
    }
    get_prediction_manifest_path(run_path).write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    report(1.0, "Prediction summary cache is ready.")


@st.cache_data(show_spinner=False)
def load_saved_prediction_artifacts(run_path_str: str) -> dict[str, pd.DataFrame]:
    """Load every cached summary table of the run into a ``name -> DataFrame`` mapping."""
    run_path = Path(run_path_str)
    return {name: pd.read_parquet(get_prediction_table_path(run_path, name)) for name in ARTIFACT_TABLES}
FileNotFoundError(f"No future.parquet or future.csv found in {run_path}") + report(0.05, f"Reading `{future_path.name}`...") + future_cols = [ + "source", + "label", + "x", + "y", + "tx", + "ty", + "mode", + "future_index", + "relative_time", + "pair_uuid", + "frame_index", + "scenario_name", + "suite_name", + "uuid", + "confidence", + ] + if future_path.suffix == ".parquet": + schema = pq.read_schema(future_path).names + optional_cols = [c for c in ["topic_name"] if c in schema] + future_df = pd.read_parquet(future_path, columns=future_cols + optional_cols) + else: + future_df = pd.read_csv(future_path, usecols=lambda c: c in set(future_cols + ["topic_name"])) + if "topic_name" in future_df.columns: + report(0.18, "Filtering the default prediction topic...") + topic_values = future_df["topic_name"].dropna().astype(str).unique().tolist() + if DEFAULT_TOPIC in topic_values: + future_df = future_df[future_df["topic_name"].astype(str) == DEFAULT_TOPIC].copy() + + report(0.3, "Matching prediction tracks against GT...") + report(0.45, "Computing per-track ADE/FDE summaries...") + report(0.62, "Aggregating metrics with specsheet-aligned distance-bin averaging...") + + def report_aggregate_progress(inner_fraction: float, message: str) -> None: + report(0.3 + (0.54 * inner_fraction), message) + + artifacts = build_specsheet_aligned_prediction_artifacts( + future_df, + checkpoints=CHECKPOINTS, + time_step=0.1, + max_error_m=100.0, + progress_callback=report_aggregate_progress, + ) + if artifacts["label_summary"].empty: + report(0.85, "No matched tracks were found. 
Creating empty summary tables...") + else: + report(0.84, "Finalizing overall summary row...") + return artifacts + + +@st.cache_data(show_spinner=False) +def build_prediction_eval_artifacts(run_path_str: str) -> dict[str, pd.DataFrame]: + return _build_prediction_eval_artifacts_impl(run_path_str) + + +def build_prediction_artifacts_with_progress(run_path: Path, build_label: str) -> None: + progress_slot = st.empty() + status_slot = st.empty() + progress_bar = progress_slot.progress(0, text=f"Starting {build_label} prediction summary build...") + + def report(fraction: float, message: str) -> None: + bounded_fraction = max(0.0, min(1.0, float(fraction))) + progress_bar.progress(int(round(bounded_fraction * 100)), text=message) + status_slot.caption(f"{build_label}: {message}") + + artifacts = _build_prediction_eval_artifacts_impl(str(run_path), progress_callback=report) + save_prediction_artifacts(run_path, artifacts, progress_callback=report) + st.cache_data.clear() + st.rerun() + + +def merge_label_compare(label_a: pd.DataFrame, label_b: pd.DataFrame) -> pd.DataFrame: + merged = label_a.merge(label_b, on="label", how="outer", suffixes=("_A", "_B")) + for metric in METRIC_ORDER: + merged[f"{metric}_delta"] = merged[f"{metric}_B"] - merged[f"{metric}_A"] + return merged + + +def merge_distance_compare(distance_a: pd.DataFrame, distance_b: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: + base = distance_a.copy() + base["run"] = "A" + cand = distance_b.copy() + cand["run"] = "B" + both = pd.concat([base, cand], ignore_index=True) + delta = distance_a.merge(distance_b, on=["label", "metric", "r"], how="outer", suffixes=("_A", "_B")) + delta["value_delta"] = delta["value_B"] - delta["value_A"] + return both, delta + + +def merge_polar_compare(polar_a: pd.DataFrame, polar_b: pd.DataFrame) -> pd.DataFrame: + delta = polar_a.merge(polar_b, on=["label", "metric", "r", "theta"], how="outer", suffixes=("_A", "_B")) + delta["value_delta"] = delta["value_B"] - 
delta["value_A"] + return delta + + +run_dirs = list_run_directories() +run_dirs = [p for p in run_dirs if _run_has_prediction_source(p)] +run_names = [get_run_display_name(p) for p in run_dirs] +if not run_names: + st.warning("No run directories with `future.parquet` or `future.csv` found under `data/`.") + st.stop() + +try_hydrate_session_from_overview_query_params() +mode_default = "Compare Mode" if st.session_state.get("mode") == "Compare Mode" else "Single Run" +mode = st.sidebar.selectbox("Mode", ["Single Run", "Compare Mode"], index=0 if mode_default == "Single Run" else 1) + +session_run_path = st.session_state.get("runA", {}).get("path") if st.session_state.get("runA") else None +default_run_name = get_run_display_name(session_run_path) if isinstance(session_run_path, Path) else run_names[0] +if default_run_name not in run_names: + default_run_name = run_names[0] + +selected_run_a = st.sidebar.selectbox( + "Baseline (A)" if mode == "Compare Mode" else "Run", + run_names, + index=run_names.index(default_run_name), + help="Select a run directory containing `future.parquet`.", +) +selected_run_b = None +if mode == "Compare Mode": + compare_candidates = [n for n in run_names if n != selected_run_a] or run_names + default_b = st.session_state.get("runB", {}).get("path").name if st.session_state.get("runB") else compare_candidates[0] + if default_b not in compare_candidates: + default_b = compare_candidates[0] + selected_run_b = st.sidebar.selectbox("Candidate (B)", compare_candidates, index=compare_candidates.index(default_b)) + +run_path_a = next(p for p in run_dirs if get_run_display_name(p) == selected_run_a) +run_path_b = next((p for p in run_dirs if get_run_display_name(p) == selected_run_b), None) +metadata_a = load_prediction_metadata(str(run_path_a)) +cache_ready_a = prediction_artifacts_ready(run_path_a) +metadata_b = load_prediction_metadata(str(run_path_b)) if run_path_b is not None else None +cache_ready_b = prediction_artifacts_ready(run_path_b) if 
run_path_b is not None else False + +if mode == "Compare Mode" and run_path_b is not None: + render_loaded_data_section( + [ + ("Baseline · A", path_display(run_path_a)), + ("Candidate · B", path_display(run_path_b)), + ] + ) +else: + render_loaded_data_section([("Prediction run", path_display(run_path_a))]) +render_page_hero( + kicker="Prediction quality", + title="Prediction evaluation", + description=( + "ADE/FDE summaries from `future.parquet`, computed from the cached prediction summary artifacts " + "and presented as interactive cards, ladders, and polar maps." + ), + mode=mode, + secondary_badge_inner_html="Prediction cache", +) +st.markdown( + f""" +
+
A: {int(metadata_a['row_count']):,} future rows
+
A: {metadata_a['file_size_mb']:.1f} MB {metadata_a['source_kind']}
+
A cache: {'ready' if cache_ready_a else 'not built'}
+ {f'
B: {int(metadata_b["row_count"]):,} future rows
' if metadata_b else ''} + {f'
B: {metadata_b["file_size_mb"]:.1f} MB {metadata_b["source_kind"]}
' if metadata_b else ''} + {f'
B cache: {"ready" if cache_ready_b else "not built"}
' if metadata_b else ''} +
+ """, + unsafe_allow_html=True, +) + +build_col, info_col = st.columns([0.34, 0.66]) +with build_col: + build_clicked_a = st.button("Build A Summary", type="primary", use_container_width=True) + build_clicked_b = st.button("Build B Summary", use_container_width=True) if mode == "Compare Mode" and run_path_b is not None else False +with info_col: + if mode == "Compare Mode": + status_lines = [ + f"A `{selected_run_a}`: {'ready' if cache_ready_a else 'not built'}", + f"B `{selected_run_b}`: {'ready' if cache_ready_b else 'not built'}" if selected_run_b else "", + ] + if cache_ready_a and cache_ready_b: + st.success("Compare result is ready. Both cached summaries are available.") + else: + needed = [] + if not cache_ready_a: + needed.append("Build A Summary") + if not cache_ready_b: + needed.append("Build B Summary") + st.info("Compare mode status:\n\n" + "\n\n".join([x for x in status_lines if x]) + f"\n\nNext step: press {' and '.join(needed)}.") + elif cache_ready_a: + st.success("Compact ADE/FDE summary tables are available for fast loading.") + else: + st.info(f"Run `{selected_run_a}` is not cached yet. 
Press Build A Summary to generate the result.") + +if build_clicked_a: + build_prediction_artifacts_with_progress(run_path_a, "A") + +if build_clicked_b and run_path_b is not None: + build_prediction_artifacts_with_progress(run_path_b, "B") + +if (mode == "Single Run" and not cache_ready_a) or (mode == "Compare Mode" and (not cache_ready_a or not cache_ready_b)): + section_header( + "Build Once, Open Fast", + "This page now stays responsive by loading only precomputed ADE/FDE summaries instead of processing the full future parquet on navigation.", + ) + st.stop() + +artifacts_a = load_saved_prediction_artifacts(str(run_path_a)) +label_summary = artifacts_a["label_summary"].copy() +distance_summary = artifacts_a["distance_summary"].copy() +polar_summary = artifacts_a["polar_summary"].copy() +artifacts_b = load_saved_prediction_artifacts(str(run_path_b)) if mode == "Compare Mode" and run_path_b is not None else None + +if label_summary.empty: + st.warning("No prediction summary data is available for this run.") + st.stop() + +available_labels = [x for x in label_summary["label"].astype(str).tolist() if x != "All"] + +overall_row = label_summary[label_summary["label"].astype(str) == "All"] +if overall_row.empty: + overall_row = label_summary.head(1) +overall = overall_row.iloc[0] +compare_label = merge_label_compare(label_summary, artifacts_b["label_summary"]) if artifacts_b is not None else None +distance_both = distance_delta = None +polar_delta = None +if artifacts_b is not None: + distance_both, distance_delta = merge_distance_compare(distance_summary, artifacts_b["distance_summary"]) + polar_delta = merge_polar_compare(polar_summary, artifacts_b["polar_summary"]) + +section_header( + "At A Glance", + "These cards mirror the kind of abstract specsheet readout we need in product review, but in a faster dashboard form.", +) +cards = st.columns(3) +with cards[0]: + if compare_label is not None: + overall_cmp = compare_label[compare_label["label"] == "All"].iloc[0] + 
render_compare_stat_card("minADE@1s <= 60m", overall_cmp["minADE@1s_A"], overall_cmp["minADE@1s_B"], "Best-of-K average displacement error within the near operating zone.") + else: + render_stat_card("minADE@1s <= 60m", f"{overall['minADE@1s']:.2f} m" if pd.notna(overall["minADE@1s"]) else "n/a", "Best-of-K average displacement error within the near operating zone.") +with cards[1]: + if compare_label is not None: + overall_cmp = compare_label[compare_label["label"] == "All"].iloc[0] + render_compare_stat_card("minADE@3s <= 60m", overall_cmp["minADE@3s_A"], overall_cmp["minADE@3s_B"], "Mid-horizon shape fidelity aligned with the specsheet future metric.") + else: + render_stat_card("minADE@3s <= 60m", f"{overall['minADE@3s']:.2f} m" if pd.notna(overall["minADE@3s"]) else "n/a", "Mid-horizon shape fidelity aligned with the specsheet future metric.") +with cards[2]: + if compare_label is not None: + overall_cmp = compare_label[compare_label["label"] == "All"].iloc[0] + render_compare_stat_card("minFDE@3s <= 60m", overall_cmp["minFDE@3s_A"], overall_cmp["minFDE@3s_B"], "Where the endpoint lands matters most in review discussions, so this gets prime placement.") + else: + render_stat_card("minFDE@3s <= 60m", f"{overall['minFDE@3s']:.2f} m" if pd.notna(overall["minFDE@3s"]) else "n/a", "Where the endpoint lands matters most in review discussions, so this gets prime placement.") + +cards2 = st.columns(3) +with cards2[0]: + if compare_label is not None: + overall_cmp = compare_label[compare_label["label"] == "All"].iloc[0] + render_compare_stat_card("minADE@5s <= 60m", overall_cmp["minADE@5s_A"], overall_cmp["minADE@5s_B"], "Longer horizon path quality, still scoped to the near-range summary window.") + else: + render_stat_card("minADE@5s <= 60m", f"{overall['minADE@5s']:.2f} m" if pd.notna(overall["minADE@5s"]) else "n/a", "Longer horizon path quality, still scoped to the near-range summary window.") +with cards2[1]: + if compare_label is not None: + overall_cmp = 
compare_label[compare_label["label"] == "All"].iloc[0] + render_compare_stat_card("minFDE@1s <= 60m", overall_cmp["minFDE@1s_A"], overall_cmp["minFDE@1s_B"], "Short horizon endpoint stability.") + else: + render_stat_card("minFDE@1s <= 60m", f"{overall['minFDE@1s']:.2f} m" if pd.notna(overall["minFDE@1s"]) else "n/a", "Short horizon endpoint stability.") +with cards2[2]: + if compare_label is not None: + overall_cmp = compare_label[compare_label["label"] == "All"].iloc[0] + render_compare_stat_card("minFDE@5s <= 60m", overall_cmp["minFDE@5s_A"], overall_cmp["minFDE@5s_B"], "Longest specsheet-style endpoint metric.") + else: + render_stat_card("minFDE@5s <= 60m", f"{overall['minFDE@5s']:.2f} m" if pd.notna(overall["minFDE@5s"]) else "n/a", f"Longest specsheet-style endpoint metric. Source rows processed: {int(overall['future_rows']):,}.") + +section_header( + "Label Performance", + "All labels are shown together so you can compare actor classes without touching filters.", +) +label_view = label_summary[label_summary["label"].isin(available_labels)].copy() +if compare_label is not None: + cmp_view = compare_label[compare_label["label"].isin(available_labels)].copy() + delta_long = cmp_view.melt( + id_vars=["label"], + value_vars=[f"{m}_delta" for m in METRIC_ORDER], + var_name="metric", + value_name="value", + ) + delta_long["metric"] = delta_long["metric"].str.replace("_delta", "", regex=False) + heat = delta_long.pivot(index="label", columns="metric", values="value").reindex(columns=METRIC_ORDER) + fig = go.Figure( + data=go.Heatmap( + z=heat.values, + x=list(heat.columns), + y=list(heat.index), + colorscale="RdBu", + zmid=0, + text=[[f"{v:+.2f}" if pd.notna(v) else "-" for v in row] for row in heat.values], + texttemplate="%{text}", + hovertemplate="label=%{y}
metric=%{x}
Δ=%{z:+.3f} m", + ) + ) + fig.update_layout( + title="ADE/FDE delta matrix: B - A within <= 60m", + xaxis_title="Metric", + yaxis_title="Label", + height=max(360, 70 * len(heat.index)), + margin=dict(l=10, r=10, t=55, b=10), + ) + st.plotly_chart(fig, width="stretch") +elif not label_view.empty: + label_long = label_view.melt( + id_vars=["label"], + value_vars=METRIC_ORDER, + var_name="metric", + value_name="value", + ) + heat = label_long.pivot(index="label", columns="metric", values="value").reindex(columns=METRIC_ORDER) + fig = go.Figure( + data=go.Heatmap( + z=heat.values, + x=list(heat.columns), + y=list(heat.index), + colorscale="YlOrRd", + text=[[f"{v:.2f}" if pd.notna(v) else "-" for v in row] for row in heat.values], + texttemplate="%{text}", + hovertemplate="label=%{y}
metric=%{x}
value=%{z:.3f} m", + ) + ) + fig.update_layout( + title="ADE/FDE matrix within <= 60m", + xaxis_title="Metric", + yaxis_title="Label", + height=max(360, 70 * len(heat.index)), + margin=dict(l=10, r=10, t=55, b=10), + ) + st.plotly_chart(fig, width="stretch") + +section_header( + "Distance Ladder", + "Compare mode defaults to clearer views than a 14-line overlay: delta heatmaps, label small multiples, and the original raw lines only as a fallback.", +) +distance_view = distance_both if distance_both is not None else distance_summary[distance_summary["label"].isin(available_labels)].copy() +if distance_both is not None and not distance_view.empty: + compare_tabs = st.tabs(["Delta Heatmap", "Label Small Multiples", "Raw Lines"]) + with compare_tabs[0]: + for start in range(0, len(METRIC_ORDER), 3): + metric_chunk = METRIC_ORDER[start : start + 3] + cols = st.columns(len(metric_chunk)) + for col, metric_name in zip(cols, metric_chunk): + with col: + metric_delta = distance_delta[ + (distance_delta["metric"] == metric_name) + & (distance_delta["label"].isin(available_labels)) + ].copy() + if metric_delta.empty: + st.caption(f"{metric_name}: no data") + continue + col_order = ordered_distance_bins(metric_delta["r"].tolist()) + pivot = ( + metric_delta.pivot(index="label", columns="r", values="value_delta") + .reindex(index=available_labels) + .reindex(columns=col_order) + ) + fig = go.Figure( + data=go.Heatmap( + z=pivot.values, + x=[str(v) for v in pivot.columns], + y=[str(v) for v in pivot.index], + colorscale="RdBu", + zmid=0, + text=[[f"{v:+.2f}" if pd.notna(v) else "-" for v in row] for row in pivot.values], + texttemplate="%{text}", + hovertemplate="label=%{y}
r=%{x}
Δ=%{z:+.3f} m", + ) + ) + fig.update_layout( + title=metric_name, + xaxis_title="Radius bin", + yaxis_title="Label", + height=max(320, 54 * len(available_labels)), + margin=dict(l=10, r=10, t=45, b=10), + ) + st.plotly_chart(fig, width="stretch", key=f"distance_delta_{metric_name}") + with compare_tabs[1]: + metric_tabs = st.tabs(METRIC_ORDER) + for metric_name, metric_tab in zip(METRIC_ORDER, metric_tabs): + with metric_tab: + metric_view = distance_view[ + (distance_view["metric"] == metric_name) + & (distance_view["label"].isin(available_labels)) + ].copy() + if metric_view.empty: + st.info(f"No data for {metric_name}.") + continue + metric_view["r"] = pd.Categorical(metric_view["r"], categories=ordered_distance_bins(metric_view["r"].tolist()), ordered=True) + for start in range(0, len(available_labels), 3): + chunk = available_labels[start : start + 3] + cols = st.columns(len(chunk)) + for col, label_name in zip(cols, chunk): + with col: + label_df = metric_view[metric_view["label"] == label_name].copy() + if label_df.empty: + st.caption(f"{label_name}: no data") + continue + fig = px.line( + label_df, + x="r", + y="value", + color="run", + markers=True, + labels={"r": "Radius bin", "value": "Error (m)", "run": "Run"}, + title=label_name, + color_discrete_map={"A": PLOTLY_COLORS["ink"], "B": PLOTLY_COLORS["amber"]}, + ) + fig.update_layout(height=280, margin=dict(l=10, r=10, t=45, b=10), legend_title="Run") + st.plotly_chart(fig, width="stretch", key=f"distance_small_{metric_name}_{label_name}") + with compare_tabs[2]: + fig = px.line( + distance_view[distance_view["label"].isin(available_labels)], + x="r", + y="value", + color="label", + line_dash="run", + markers=True, + facet_col="metric", + facet_col_wrap=3, + category_orders={"r": ordered_distance_bins(distance_view["r"].tolist())}, + labels={"r": "Radius bin (m)", "value": "Error (m)", "label": "Label", "run": "Run"}, + title="ADE/FDE by distance bin: A vs B", + color_discrete_sequence=[ + 
PLOTLY_COLORS["ink"], + PLOTLY_COLORS["blue"], + PLOTLY_COLORS["teal"], + PLOTLY_COLORS["amber"], + PLOTLY_COLORS["rose"], + PLOTLY_COLORS["slate"], + "#8b5cf6", + ], + ) + fig.update_layout(height=760, margin=dict(l=10, r=10, t=55, b=10), legend_title="Label / Run") + fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1])) + st.plotly_chart(fig, width="stretch", key="distance_raw_compare") +elif not distance_view.empty: + single_tabs = st.tabs(["Lines", "Metric Heatmaps", "Circular Rings", "Label Small Multiples"]) + with single_tabs[0]: + fig = px.line( + distance_view, + x="r", + y="value", + color="label", + markers=True, + facet_col="metric", + facet_col_wrap=3, + category_orders={"r": ordered_distance_bins(distance_view["r"].tolist())}, + labels={"r": "Radius bin (m)", "value": "Error (m)", "label": "Label"}, + title="ADE/FDE by distance bin", + color_discrete_sequence=[ + PLOTLY_COLORS["ink"], + PLOTLY_COLORS["blue"], + PLOTLY_COLORS["teal"], + PLOTLY_COLORS["amber"], + PLOTLY_COLORS["rose"], + PLOTLY_COLORS["slate"], + "#8b5cf6", + ], + ) + fig.update_layout(height=760, margin=dict(l=10, r=10, t=55, b=10), legend_title="Label") + fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1])) + st.plotly_chart(fig, width="stretch", key="distance_single_lines") + with single_tabs[1]: + for start in range(0, len(METRIC_ORDER), 3): + metric_chunk = METRIC_ORDER[start : start + 3] + cols = st.columns(len(metric_chunk)) + for col, metric_name in zip(cols, metric_chunk): + with col: + metric_df = distance_view[ + (distance_view["metric"] == metric_name) + & (distance_view["label"].isin(available_labels)) + ].copy() + if metric_df.empty: + st.caption(f"{metric_name}: no data") + continue + col_order = ordered_distance_bins(metric_df["r"].tolist()) + pivot = ( + metric_df.pivot(index="label", columns="r", values="value") + .reindex(index=available_labels) + .reindex(columns=col_order) + ) + fig = go.Figure( + data=go.Heatmap( + 
z=pivot.values, + x=[str(v) for v in pivot.columns], + y=[str(v) for v in pivot.index], + colorscale="YlOrRd", + text=[[f"{v:.2f}" if pd.notna(v) else "-" for v in row] for row in pivot.values], + texttemplate="%{text}", + hovertemplate="label=%{y}
r=%{x}
value=%{z:.3f} m", + ) + ) + fig.update_layout( + title=metric_name, + xaxis_title="Radius bin", + yaxis_title="Label", + height=max(320, 54 * len(available_labels)), + margin=dict(l=10, r=10, t=45, b=10), + ) + st.plotly_chart(fig, width="stretch", key=f"distance_single_heat_{metric_name}") + with single_tabs[2]: + for start in range(0, len(METRIC_ORDER), 2): + metric_chunk = METRIC_ORDER[start : start + 2] + cols = st.columns(len(metric_chunk)) + for col, metric_name in zip(cols, metric_chunk): + with col: + metric_df = distance_view[ + (distance_view["metric"] == metric_name) + & (distance_view["label"].isin(available_labels)) + ].copy() + if metric_df.empty: + st.caption(f"{metric_name}: no data") + continue + fig = build_distance_ring_figure(metric_df, available_labels, metric_name) + st.plotly_chart(fig, width="stretch", key=f"distance_single_ring_{metric_name}") + with single_tabs[3]: + metric_tabs = st.tabs(METRIC_ORDER) + for metric_name, metric_tab in zip(METRIC_ORDER, metric_tabs): + with metric_tab: + metric_df = distance_view[ + (distance_view["metric"] == metric_name) + & (distance_view["label"].isin(available_labels)) + ].copy() + if metric_df.empty: + st.info(f"No data for {metric_name}.") + continue + metric_df["r"] = pd.Categorical(metric_df["r"], categories=ordered_distance_bins(metric_df["r"].tolist()), ordered=True) + for start in range(0, len(available_labels), 3): + chunk = available_labels[start : start + 3] + cols = st.columns(len(chunk)) + for col, label_name in zip(cols, chunk): + with col: + label_df = metric_df[metric_df["label"] == label_name].copy() + if label_df.empty: + st.caption(f"{label_name}: no data") + continue + fig = px.line( + label_df, + x="r", + y="value", + markers=True, + title=label_name, + labels={"r": "Radius bin", "value": "Error (m)"}, + color_discrete_sequence=[PLOTLY_COLORS["blue"]], + ) + fig.update_layout(height=280, margin=dict(l=10, r=10, t=45, b=10), showlegend=False) + st.plotly_chart(fig, width="stretch", 
key=f"distance_single_small_{metric_name}_{label_name}") + +section_header( + "Polar Field", + "Each tab is one metric, and every label gets its own heatmap. That keeps the page filter-free while still easy to scan.", +) +polar_view_tabs = st.tabs(["Heatmap", "Circular"]) +for view_name, outer_tab in zip(["heatmap", "circular"], polar_view_tabs): + with outer_tab: + metric_tabs = st.tabs(METRIC_ORDER) + for metric_name, metric_tab in zip(METRIC_ORDER, metric_tabs): + with metric_tab: + metric_polar = polar_delta[polar_delta["metric"] == metric_name].copy() if polar_delta is not None else polar_summary[polar_summary["metric"] == metric_name].copy() + value_col = "value_delta" if polar_delta is not None else "value" + if metric_polar.empty: + st.info(f"No data for {metric_name}.") + continue + for start in range(0, len(available_labels), 3): + chunk = available_labels[start : start + 3] + cols = st.columns(len(chunk)) + for col, label_name in zip(cols, chunk): + with col: + label_polar = metric_polar[metric_polar["label"] == label_name].copy() + if label_polar.empty: + st.caption(f"{label_name}: no data") + continue + if view_name == "heatmap": + pivot = ( + label_polar.pivot(index="r", columns="theta", values=value_col) + .reindex(index=R_LABELS, columns=THETA_LABELS) + ) + fig = go.Figure( + data=go.Heatmap( + z=pivot.values, + x=[str(v) for v in pivot.columns], + y=[str(v) for v in pivot.index], + colorscale="RdBu" if polar_delta is not None else "YlOrRd", + zmid=0 if polar_delta is not None else None, + hovertemplate=("theta=%{x}
r=%{y}
Δ=%{z:+.3f} m" if polar_delta is not None else "theta=%{x}
r=%{y}
value=%{z:.3f} m"), + ) + ) + fig.update_layout( + title=f"{label_name} (B - A)" if polar_delta is not None else label_name, + xaxis_title="Theta", + yaxis_title="Radius", + height=320, + margin=dict(l=10, r=10, t=45, b=10), + ) + st.plotly_chart(fig, width="stretch", key=f"polar_{view_name}_{metric_name}_{label_name}") + else: + fig = build_theta_ring_figure( + label_polar=label_polar, + metric_name=metric_name, + label_name=label_name, + value_col=value_col, + delta_mode=polar_delta is not None, + ) + st.plotly_chart(fig, width="stretch", key=f"polar_{view_name}_{metric_name}_{label_name}") + +section_header( + "Metric Table", + "Exact summary values for the labels in view, aligned with the specsheet future metric definitions.", +) +table_cols = ["label", "future_rows"] + METRIC_ORDER +st.dataframe( + ( + compare_label[["label"] + [f"{m}_A" for m in METRIC_ORDER] + [f"{m}_B" for m in METRIC_ORDER] + [f"{m}_delta" for m in METRIC_ORDER]] + if compare_label is not None + else (label_view[table_cols] if not label_view.empty else label_summary[table_cols]) + ), + width="stretch", + hide_index=True, + column_config={ + "future_rows": st.column_config.NumberColumn("Rows", format="%d"), + "minADE@1s": st.column_config.NumberColumn("minADE@1s", format="%.3f m"), + "minADE@3s": st.column_config.NumberColumn("minADE@3s", format="%.3f m"), + "minADE@5s": st.column_config.NumberColumn("minADE@5s", format="%.3f m"), + "minFDE@1s": st.column_config.NumberColumn("minFDE@1s", format="%.3f m"), + "minFDE@3s": st.column_config.NumberColumn("minFDE@3s", format="%.3f m"), + "minFDE@5s": st.column_config.NumberColumn("minFDE@5s", format="%.3f m"), + }, +) diff --git a/evaluation_dashboard_app/pages/13_Trend_Insights.py b/evaluation_dashboard_app/pages/13_Trend_Insights.py new file mode 100644 index 0000000..f4d5235 --- /dev/null +++ b/evaluation_dashboard_app/pages/13_Trend_Insights.py @@ -0,0 +1,2133 @@ +from __future__ import annotations + +import json +import re +import shutil 
+from html import escape +from pathlib import Path +from typing import Any + +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import streamlit as st +import streamlit.components.v1 as components + +from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header +from lib.path_utils import get_data_root, path_display, resolve_under_data_root +from lib.release_specsheet_library import discover_release_specsheet_inventory +from lib.specsheet_report import ( + DEFAULT_TREND_METADATA_TEXT, + TREND_METADATA_FILENAME, + TREND_SUMMARY_FILENAME, + TrendReleaseGroup, + classify_trend_summary, + discover_trend_release_groups, + extract_devops_case_rows, + extract_performance_metrics_from_summary, + load_trend_summary_file, + parse_trend_metadata_text, +) + +st.set_page_config(page_title="Trend Insights", layout="wide", initial_sidebar_state="expanded") +inject_app_page_styles() + + +def _parse_data_count(value: Any) -> int | None: + text = str(value or "").strip().replace(",", "").replace("+", "") + if not text: + return None + try: + return int(text) + except ValueError: + return None + + +def _select_primary_metadata(group: TrendReleaseGroup) -> dict[str, Any]: + for role in ("full", "usecase", "devops", "performance_blocks", "unknown"): + if role in group.jobs: + return group.jobs[role]["metadata"] + return {} + + +def _safe_path_part(value: Any, fallback: str) -> str: + text = str(value or "").strip() + text = re.sub(r"[^\w.\-]+", "_", text).strip("._") + return text or fallback + + +def _resolve_summary_json_input(user_path: str) -> tuple[Path | None, str]: + resolved, err = resolve_under_data_root(user_path, allow_missing=False) + if err: + return None, err + assert resolved is not None + if resolved.is_file(): + if resolved.name != TREND_SUMMARY_FILENAME: + return None, f"Expected a {TREND_SUMMARY_FILENAME} file: {path_display(resolved)}" + return resolved, "" + for candidate in ( + resolved / 
TREND_SUMMARY_FILENAME, + resolved / "resources" / TREND_SUMMARY_FILENAME, + ): + if candidate.exists(): + return candidate, "" + return None, f"No {TREND_SUMMARY_FILENAME} found in {path_display(resolved)} or its resources/ folder." + + +def _default_job_id_from_summary(summary_path: Path) -> str: + if summary_path.parent.name == "resources": + return summary_path.parent.parent.name + return summary_path.parent.name + + +def _assemble_trend_release_group( + *, + release_name: str, + topic_name: str, + role_sources: dict[str, str], + role_job_ids: dict[str, str], + metadata: dict[str, Any], +) -> Path: + data_root = get_data_root() + release_dir = data_root / _safe_path_part(release_name, "trend_release") + topic_dir = release_dir / _safe_path_part(topic_name, "perception.object_recognition.objects") + expected_roles = {"full", "usecase", "devops"} + seen_roles: dict[str, Path] = {} + + for expected_role, source_text in role_sources.items(): + summary_path, err = _resolve_summary_json_input(source_text) + if err: + raise ValueError(f"{expected_role}: {err}") + assert summary_path is not None + summary = load_trend_summary_file(summary_path) + actual_role = classify_trend_summary(summary) + if actual_role != expected_role: + raise ValueError( + f"{expected_role}: {path_display(summary_path)} classified as `{actual_role}`, " + f"not `{expected_role}`." 
+ ) + seen_roles[actual_role] = summary_path + + missing = sorted(expected_roles - set(seen_roles)) + if missing: + raise ValueError(f"Missing required trend roles: {', '.join(missing)}") + + for role, summary_path in seen_roles.items(): + job_id = _safe_path_part(role_job_ids.get(role) or _default_job_id_from_summary(summary_path), role) + job_dir = topic_dir / job_id + job_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(summary_path, job_dir / TREND_SUMMARY_FILENAME) + with (job_dir / TREND_METADATA_FILENAME).open("w", encoding="utf-8") as fh: + import yaml + + yaml.safe_dump(metadata, fh, allow_unicode=True, sort_keys=False) + return release_dir + + +def _render_release_trend_builder() -> None: + section_header("Build Release Trend Group") + with st.expander("Assemble full/usecase/devops summaries into one release", expanded=False): + with st.form("release_trend_builder_form"): + form_col1, form_col2 = st.columns([1.1, 1.2]) + with form_col1: + release_name = st.text_input( + "Release folder name", + value="trend_release___", + ) + topic_name = st.text_input( + "Topic folder", + value="perception.object_recognition.objects", + ) + full_source = st.text_input("Full summary source") + usecase_source = st.text_input("Usecase summary source") + devops_source = st.text_input("DevOps summary source") + with form_col2: + full_job_id = st.text_input("Full job id override", value="") + usecase_job_id = st.text_input("Usecase job id override", value="") + devops_job_id = st.text_input("DevOps job id override", value="") + metadata_text = st.text_area( + "Release metadata YAML", + value=DEFAULT_TREND_METADATA_TEXT, + height=180, + help="Required keys: tags, pilot_auto_version, data_count, description, date.", + ) + submitted = st.form_submit_button("Create Release Trend Group", type="primary") + + if submitted: + try: + metadata = parse_trend_metadata_text(metadata_text) + created_dir = _assemble_trend_release_group( + release_name=release_name, + 
topic_name=topic_name, + role_sources={ + "full": full_source, + "usecase": usecase_source, + "devops": devops_source, + }, + role_job_ids={ + "full": full_job_id, + "usecase": usecase_job_id, + "devops": devops_job_id, + }, + metadata=metadata, + ) + st.success(f"Created release trend group at `{path_display(created_dir)}`. Refreshing inventory...") + st.rerun() + except Exception as exc: + st.error(f"Could not create release trend group: {exc}") + + +def _release_display_name(version: Any, date: Any, description: Any = "") -> str: + version_text = str(version or "").strip() or "Unknown Version" + date_text = str(date or "").strip() + description_text = str(description or "").strip() + suffix = f" | {date_text}" if date_text else "" + if description_text: + suffix += f" | {description_text}" + return f"{version_text}{suffix}" + + +def _with_pass_rate(frame: pd.DataFrame, *, passed_col: str = "passed", total_col: str = "total") -> pd.DataFrame: + enriched = frame.copy() + total = pd.to_numeric(enriched[total_col], errors="coerce") + passed = pd.to_numeric(enriched[passed_col], errors="coerce") + enriched["pass_rate"] = (passed / total.replace(0, pd.NA)) * 100.0 + return enriched + + +def _update_version_axis(fig: go.Figure, versions: list[str]) -> None: + fig.update_xaxes(categoryorder="array", categoryarray=versions) + + +def _role_overview_url(release_row: dict[str, Any], role: str) -> str: + role_info = release_row.get("roles", {}).get(role, {}) + return str(role_info.get("overview_url") or "") + + +def _role_debug_path(release_row: dict[str, Any], role: str) -> str: + role_info = release_row.get("roles", {}).get(role, {}) + return str(role_info.get("absolute_path") or "") + + +def _role_evaluator_url(release_row: dict[str, Any], role: str) -> str: + role_info = release_row.get("roles", {}).get(role, {}) + return str(role_info.get("evaluator_report_url") or "") + + +def _topic_family(topic_name: Any) -> str: + topic = str(topic_name or "") + if topic == 
"perception.object_recognition.objects": + return "Perception Performance" + if topic.startswith("perception.object_recognition.detection."): + return "ML Model Performance" + return "Other" + + +def _date_sort_value(value: Any) -> float: + parsed = pd.to_datetime(value, format="%Y.%m.%d", errors="coerce") + if pd.isna(parsed): + return -1.0 + return float(parsed.timestamp()) + + +def _html_link(url: str, label: str, variant: str = "action") -> str: + if not url: + return '-' + return ( + f'{escape(label)}' + ) + + +def _pdf_links_for_prefix(release: dict[str, Any], prefix: str) -> str: + links = [] + for pdf in release.get("pdfs", []): + topic = str(pdf.get("topic") or "") + if topic == prefix or topic.startswith(prefix): + label = "Prediction" + if topic.startswith("perception.object_recognition.detection."): + label = topic.replace("perception.object_recognition.detection.", "").replace(".objects", "") + label = label.replace("bevfusion", "BEVFusion").replace("centerpoint", "CenterPoint") + links.append(_html_link(str(pdf.get("static_url") or ""), label, "pdf")) + return '' + "".join(links) + "" if links else '-' + + +def _has_pdf_for_prefix(release: dict[str, Any], prefix: str) -> bool: + for pdf in release.get("pdfs", []): + topic = str(pdf.get("topic") or "") + if topic == prefix or topic.startswith(prefix): + return True + return False + + +def _render_release_library_table(releases: list[dict[str, Any]]) -> None: + group_headers = [ + ("Release", 4), + ("Overview", 3), + ("Specsheet PDF", 2), + ("Evaluator Job", 3), + ] + col_widths = [360, 96, 240, 92, 96, 96, 96, 128, 168, 96, 96, 96] + headers = [ + "Version", + "Date", + "Description", + "Data", + "Performance", + "Usecase", + "DevOps", + "Prediction", + "Detection", + "Performance", + "Usecase", + "DevOps", + ] + sort_types = ["text", "date", "text", "number", "text", "text", "text", "text", "text", "text", "text", "text"] + sortable_columns = {0, 1, 2, 3} + rows_html = [] + for release in releases: + 
sort_values = [ + str(release.get("version") or ""), + str(_date_sort_value(release.get("date"))), + str(release.get("description") or ""), + str(_parse_data_count(release.get("data_count")) or -1), + "open" if _role_overview_url(release, "performance") else "", + "open" if _role_overview_url(release, "usecase") else "", + "open" if _role_overview_url(release, "devops") else "", + "prediction" if _has_pdf_for_prefix(release, "perception.object_recognition.objects") else "", + "detection" if _has_pdf_for_prefix(release, "perception.object_recognition.detection.") else "", + "report" if _role_evaluator_url(release, "performance") else "", + "report" if _role_evaluator_url(release, "usecase") else "", + "report" if _role_evaluator_url(release, "devops") else "", + ] + cells = [ + escape(str(release.get("version") or "")), + escape(str(release.get("date") or "")), + escape(str(release.get("description") or "")), + escape(str(release.get("data_count") or "")), + _html_link(_role_overview_url(release, "performance"), "Open", "overview"), + _html_link(_role_overview_url(release, "usecase"), "Open", "overview"), + _html_link(_role_overview_url(release, "devops"), "Open", "overview"), + _pdf_links_for_prefix(release, "perception.object_recognition.objects"), + _pdf_links_for_prefix(release, "perception.object_recognition.detection."), + _html_link(_role_evaluator_url(release, "performance"), "Report", "job"), + _html_link(_role_evaluator_url(release, "usecase"), "Report", "job"), + _html_link(_role_evaluator_url(release, "devops"), "Report", "job"), + ] + rows_html.append( + "" + + "".join( + f'{cell}' + for cell, sort_value in zip(cells, sort_values) + ) + + "" + ) + table_html = f""" + + + + + + + +
+
+ + {''.join(f'' for width in col_widths)} + + {''.join(f'' for header, span in group_headers)} + {''.join(f'' if idx in sortable_columns else f'' for idx, header in enumerate(headers))} + + {''.join(rows_html)} +
{escape(header)}
{escape(header)}
+
+
+ + + +""" + component_height = 78 + max(1, len(releases)) * 32 + components.html(table_html, height=component_height, scrolling=False) + + +def _release_inventory_debug_rows(releases: list[dict[str, Any]]) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for release in releases: + rows.append( + { + "version": release["version"], + "date": release["date"], + "release": release["release"], + "release_dir": release["release_dir_absolute"], + "performance_dir": _role_debug_path(release, "performance"), + "usecase_dir": _role_debug_path(release, "usecase"), + "devops_dir": _role_debug_path(release, "devops"), + "performance_job_url": _role_evaluator_url(release, "performance"), + "usecase_job_url": _role_evaluator_url(release, "usecase"), + "devops_job_url": _role_evaluator_url(release, "devops"), + "pdf_paths": "\n".join(pdf["absolute_path"] for pdf in release.get("pdfs", [])), + } + ) + return rows + + +def _release_metric_bar_ranges(frame: pd.DataFrame) -> dict[str, tuple[float, float]]: + ranges: dict[str, tuple[float, float]] = {} + metric_columns = ("mAP", "precision", "recall", "overall_pass_rate", "FNR", "x_error", "y_error", "yaw_error") + for column in metric_columns: + if column not in frame.columns: + continue + values = pd.to_numeric(frame[column], errors="coerce") + if not values.notna().any(): + continue + min_value = float(values.min(skipna=True)) + max_value = float(values.max(skipna=True)) + if abs(max_value - min_value) < 1e-12: + if column == "overall_pass_rate": + min_value, max_value = 0.0, 100.0 + elif column in {"mAP", "precision", "recall"}: + min_value, max_value = 0.0, 1.0 + else: + min_value, max_value = 0.0, max(max_value, 1.0) + ranges[column] = (min_value, max_value) + return ranges + + +def _release_performance_cell_html(value: Any, column: str, ranges: dict[str, tuple[float, float]]) -> str: + metric_columns = {"mAP", "precision", "recall", "overall_pass_rate", "FNR", "x_error", "y_error", "yaw_error"} + if column not in 
metric_columns: + return escape(str(value or "")) + + numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0] + if pd.isna(numeric): + return '-' + + min_value, max_value = ranges.get(column, (0.0, 1.0)) + span = max(max_value - min_value, 1e-12) + normalized = max(0.0, min(1.0, (float(numeric) - min_value) / span)) + pct = 8.0 + normalized * 92.0 + if column == "overall_pass_rate": + label = f"{float(numeric):.1f}%" + else: + label = f"{float(numeric):.3f}" + + # Calm app-aligned palette: soft rose for weak/concerning values, soft teal for strong/healthy values. + teal = (45, 212, 191) + rose = (251, 113, 133) + if column in {"mAP", "precision", "recall", "overall_pass_rate"}: + color_ratio = normalized + else: + color_ratio = 1.0 - normalized + red = round(rose[0] + (teal[0] - rose[0]) * color_ratio) + green = round(rose[1] + (teal[1] - rose[1]) * color_ratio) + blue = round(rose[2] + (teal[2] - rose[2]) * color_ratio) + + return ( + f'
' + f'{escape(label)}' + "
" + ) + + +def _release_performance_column_group(column: str) -> str: + if column in {"version", "date", "description", "data_count"}: + return "Release" + if column in {"mAP", "precision", "recall"}: + return "Score" + if column in {"FNR", "x_error", "y_error", "yaw_error"}: + return "Error" + if column == "overall_pass_rate": + return "Pass Rate" + return "Jobs / Metadata" + + +def _render_release_performance_html_table(frame: pd.DataFrame) -> None: + ranges = _release_metric_bar_ranges(frame) + numeric_columns = {"mAP", "precision", "recall", "overall_pass_rate", "FNR", "x_error", "y_error", "yaw_error", "data_count"} + group_spans: list[tuple[str, int]] = [] + for column in frame.columns: + group = _release_performance_column_group(str(column)) + if group_spans and group_spans[-1][0] == group: + group_spans[-1] = (group, group_spans[-1][1] + 1) + else: + group_spans.append((group, 1)) + group_header_html = "".join( + f'{escape(group)}' + for group, span in group_spans + ) + header_html = "".join( + ( + f'' + ) + for idx, column in enumerate(frame.columns) + ) + row_html = [] + for _, row in frame.iterrows(): + cells = [] + for column in frame.columns: + value = row.get(column) + if column == "data_count": + parsed_count = _parse_data_count(value) + sort_value = "" if parsed_count is None else str(parsed_count) + elif column in numeric_columns: + numeric = pd.to_numeric(pd.Series([value]), errors="coerce").iloc[0] + sort_value = "" if pd.isna(numeric) else f"{float(numeric):.12g}" + else: + sort_value = str(value or "") + cells.append( + f'' + f"{_release_performance_cell_html(value, column, ranges)}" + ) + row_html.append(f"{''.join(cells)}") + + table_html = f""" + + + + + + + +
+ + + {group_header_html} + {header_html} + + {''.join(row_html)} +
+
+ + + +""" + component_height = 76 + max(1, len(frame)) * 34 + components.html(table_html, height=component_height, scrolling=False) + + +def _release_performance_table( + frame: pd.DataFrame, + *, + family: str, + empty_message: str, + table_mode: str, +) -> None: + if frame.empty: + st.info(empty_message) + return + view = frame[frame["topic_family"] == family].copy() + if view.empty: + st.info(empty_message) + return + columns = [ + "version", + "date", + "description", + "data_count", + "mAP", + "precision", + "recall", + "FNR", + "x_error", + "y_error", + "yaw_error", + "roles", + "full_job_id", + "usecase_job_id", + "devops_job_id", + "topic_name", + ] + if family == "Perception Performance": + columns.insert(columns.index("roles"), "overall_pass_rate") + visible = [column for column in columns if column in view.columns] + display_frame = view.sort_values(["date_sort", "version", "release_name"], ascending=[False, False, False])[visible] + if table_mode == "Colored bars": + _render_release_performance_html_table(display_frame) + else: + dataframe_height = 52 + max(1, len(display_frame)) * 36 + dataframe_column_config = { + "version": st.column_config.TextColumn("version", width="large"), + "description": st.column_config.TextColumn("description", width="medium"), + "full_job_id": st.column_config.TextColumn("full_job_id", width="large"), + "usecase_job_id": st.column_config.TextColumn("usecase_job_id", width="large"), + "devops_job_id": st.column_config.TextColumn("devops_job_id", width="large"), + "topic_name": st.column_config.TextColumn("topic_name", width="large"), + } + st.dataframe( + display_frame, + width="stretch", + hide_index=True, + height=dataframe_height, + column_config={key: value for key, value in dataframe_column_config.items() if key in display_frame.columns}, + ) + + +def _build_pass_combo_chart( + frame: pd.DataFrame, + *, + title: str, + versions: list[str], + line_y_col: str = "pass_rate", + series_col: str | None = None, + 
scenario_count_col: str = "total", + hover_cols: list[str] | None = None, +) -> go.Figure: + fig = go.Figure() + show_legend = series_col is not None + scenario_totals = ( + frame.groupby("version", dropna=False)[scenario_count_col] + .sum() + .reindex(versions) + .fillna(0) + ) + fig.add_bar( + x=versions, + y=scenario_totals.tolist(), + name="Scenario Count", + marker_color="#bfdbfe", + opacity=0.32, + yaxis="y2", + hovertemplate="%{x}
Scenario Count: %{y:.0f}", + ) + + hover_cols = hover_cols or ["date", "release_name", "passed", "total"] + plot_df = frame.copy() + version_order = {version: idx for idx, version in enumerate(versions)} + plot_df["__version_order"] = plot_df["version"].map(version_order).fillna(len(version_order)) + plot_df = plot_df.sort_values(["__version_order", "version", "date", "release_name"]) + if series_col is None: + fig.add_trace( + go.Scatter( + x=plot_df["version"], + y=plot_df[line_y_col], + name=title, + mode="lines+markers", + line=dict(color="#1d4ed8", width=3), + marker=dict(size=8, color="#1d4ed8"), + customdata=plot_df[hover_cols].to_numpy() if hover_cols else None, + hovertemplate="%{x}
Pass Rate: %{y:.1f}%
Date: %{customdata[0]}
Release: %{customdata[1]}", + ) + ) + else: + palette = px.colors.qualitative.Bold + px.colors.qualitative.Safe + px.colors.qualitative.Set2 + for idx, series_name in enumerate(plot_df[series_col].dropna().astype(str).unique().tolist()): + series_df = plot_df[plot_df[series_col].astype(str) == series_name].sort_values( + ["__version_order", "version", "date", "release_name"] + ) + color = palette[idx % len(palette)] + fig.add_trace( + go.Scatter( + x=series_df["version"], + y=series_df[line_y_col], + name=series_name, + mode="lines+markers", + line=dict(color=color, width=3), + marker=dict(size=7, color=color), + customdata=series_df[hover_cols].to_numpy() if hover_cols else None, + hovertemplate=( + "%{x}
" + + f"{series_col.replace('_', ' ').title()}: {series_name}
" + + "Pass Rate: %{y:.1f}%
" + + "Date: %{customdata[0]}
" + + "Release: %{customdata[1]}
" + + "Passed: %{customdata[2]:.0f}
" + + "Total: %{customdata[3]:.0f}" + ), + ) + ) + + fig.update_layout( + title=title, + xaxis_title="Pilot.Auto Version", + yaxis_title="Pass Rate (%)", + yaxis2=dict(title="Scenario Count", overlaying="y", side="right", showgrid=False), + height=440, + showlegend=show_legend, + legend=dict(orientation="h", yanchor="top", y=-0.22, x=0, xanchor="left"), + margin=dict(l=20, r=20, t=80, b=90), + plot_bgcolor="#ffffff", + paper_bgcolor="#ffffff", + ) + fig.update_xaxes(showgrid=False, categoryorder="array", categoryarray=versions) + fig.update_yaxes(range=[0, 100], gridcolor="rgba(148, 163, 184, 0.18)") + return fig + + +def _build_defect_hierarchy_bars( + frame: pd.DataFrame, + *, + category_cols: list[str], + title: str, + color_col: str = "major_category", + label_cols: list[str] | None = None, + color_map: dict[str, str] | None = None, +) -> go.Figure: + bars = frame.copy() + for category_col in category_cols: + bars[category_col] = bars[category_col].fillna("Unspecified") + label_cols = label_cols or category_cols + bars["full_label"] = bars[label_cols].astype(str).agg(" / ".join, axis=1) + bars["label"] = bars["full_label"] + bars = bars.sort_values(category_cols + ["pass_rate", "total"], ascending=[True] * len(category_cols) + [False, False]) + fig = px.bar( + bars, + x="label", + y="pass_rate", + color=color_col, + color_discrete_map=color_map, + hover_data={"label": False, "full_label": True, "passed": True, "total": True}, + text=bars["pass_rate"].map(lambda value: f"{value:.1f}%" if pd.notna(value) else "n/a"), + title=title, + ) + fig.update_layout( + height=500, + margin=dict(l=20, r=20, t=70, b=140), + xaxis_title=" / ".join(label.replace("_", " ").title() for label in label_cols), + yaxis_title="Pass Rate (%)", + legend_title_text=color_col.replace("_", " ").title(), + ) + fig.update_traces(textposition="outside", cliponaxis=False) + fig.update_xaxes(tickangle=-35, automargin=True) + fig.update_yaxes(range=[0, 100], automargin=True) + return fig + + 
def _build_defect_case_bars(
    frame: pd.DataFrame,
    *,
    ordered_mid_categories: list[str],
    max_cases: int = 20,
) -> go.Figure:
    """Build a bar chart of per-case pass rates, grouped by mid category.

    Args:
        frame: Case rows; the code reads the ``minor_category``, ``case_name``,
            ``mid_category``, ``major_category``, ``pass_rate``, ``passed`` and
            ``total`` columns.
        ordered_mid_categories: Mid categories in display order. Categories
            missing from this list get no order value and therefore sort last.
        max_cases: Maximum number of bars kept after sorting.

    Returns:
        A Plotly figure with one bar per retained case.
    """
    case_bars = frame.copy()
    # Fall back to the case name when no minor category is recorded.
    case_bars["minor_category"] = case_bars["minor_category"].fillna(case_bars["case_name"])
    case_bars["mid_order"] = case_bars["mid_category"].map(
        {mid_category: idx for idx, mid_category in enumerate(ordered_mid_categories)}
    )
    # Within each mid category: lowest pass rate first, larger totals first on ties.
    case_bars = case_bars.sort_values(["mid_order", "pass_rate", "total"], ascending=[True, True, False])
    case_bars = case_bars.head(max_cases)
    fig = px.bar(
        case_bars,
        x="minor_category",
        y="pass_rate",
        color="mid_category",
        hover_data=["major_category", "mid_category", "passed", "total"],
        text=case_bars["pass_rate"].map(lambda value: f"{value:.1f}%" if pd.notna(value) else "n/a"),
        title="Case Pass Rates",
    )
    fig.update_layout(
        height=500,
        margin=dict(l=20, r=20, t=70, b=140),
        xaxis_title="Case",
        yaxis_title="Pass Rate (%)",
        legend_title_text="Mid Category",
    )
    fig.update_traces(textposition="outside", cliponaxis=False)
    # Pin the x order to the sorted frame; otherwise the colour grouping reorders the bars.
    fig.update_xaxes(
        tickangle=-35,
        automargin=True,
        categoryorder="array",
        categoryarray=case_bars["minor_category"].tolist(),
    )
    fig.update_yaxes(range=[0, 100], automargin=True)
    return fig


def _build_metric_timeline_heatmap(
    frame: pd.DataFrame,
    *,
    value_col: str,
    title: str,
    color_title: str,
) -> go.Figure:
    """Build a label-by-release heatmap for one metric column.

    Args:
        frame: Long-form rows with ``label_name``, ``release_axis`` and
            ``value_col`` columns.
        value_col: Column providing the cell values. When its name contains
            ``"delta"`` a diverging colour scale centred on 0 is used.
        title: Figure title.
        color_title: Colour bar title.

    Returns:
        A Plotly heatmap figure; labels with no values at all are dropped.
    """
    # NOTE(review): pivot_table orders the release columns by sorting their labels,
    # not by release date - confirm that is the intended axis order.
    matrix = frame.pivot_table(
        index="label_name",
        columns="release_axis",
        values=value_col,
        aggfunc="first",
    ).dropna(how="all")
    is_delta = "delta" in value_col
    fig = px.imshow(
        matrix,
        aspect="auto",
        color_continuous_scale=["#7f1d1d", "#f8fafc", "#14532d"] if is_delta else ["#f8fafc", "#8dd3c7", "#0f766e"],
        color_continuous_midpoint=0 if is_delta else None,
        text_auto=".3f",
    )
    fig.update_layout(
        title=title,
        margin=dict(l=20, r=20, t=70, b=20),
        coloraxis_colorbar=dict(title=color_title),
    )
    fig.update_xaxes(tickangle=-30, automargin=True)
    fig.update_yaxes(automargin=True)
    return fig


def _build_metric_label_lines(
    frame: pd.DataFrame,
    *,
    title: str,
    ordered_axes: list[str],
) -> go.Figure:
    """Build one line per label showing ``value`` across releases.

    Args:
        frame: Rows with ``value``, ``release_axis``, ``label_name``,
            ``version``, ``date`` and ``release_name`` columns.
        title: Figure title.
        ordered_axes: Release axis labels in display order; labels not listed
            sort after the listed ones.

    Returns:
        A Plotly line figure; rows without a value are dropped.
    """
    plot_df = frame.dropna(subset=["value"]).copy()
    axis_order = {axis: idx for idx, axis in enumerate(ordered_axes)}
    plot_df["__axis_order"] = plot_df["release_axis"].map(axis_order).fillna(len(axis_order))
    plot_df = plot_df.sort_values(["label_name", "__axis_order", "release_axis"])
    fig = px.line(
        plot_df,
        x="release_axis",
        y="value",
        color="label_name",
        markers=True,
        hover_data=["version", "date", "release_name"],
        title=title,
    )
    fig.update_layout(margin=dict(l=20, r=20, t=70, b=20), legend_title_text="Label")
    fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes, tickangle=-30, automargin=True)
    fig.update_traces(connectgaps=True)
    return fig


def _horizon_metric_sort_key(metric_name: str) -> tuple[float, str]:
    """Return a sort key ordering horizon metrics numerically.

    The text after the last ``@`` (minus a trailing ``s``) is parsed as a
    float, so ``"minADE@10s"`` sorts after ``"minADE@5s"``. Names whose horizon
    cannot be parsed, or parses to NaN, get ``inf`` so they sort last instead
    of raising or producing an unstable order.
    """
    name = str(metric_name)
    horizon_text = name.rsplit("@", 1)[-1].removesuffix("s")
    try:
        horizon = float(horizon_text)
    except ValueError:
        horizon = float("inf")
    # float("nan") parses without error, but NaN compares false against
    # everything and would leave the sort order undefined.
    if pd.isna(horizon):
        horizon = float("inf")
    return horizon, name


def _horizon_metric_label(metric_name: str) -> str:
    """Return the text after the last ``@``, or the whole name when there is none."""
    name = str(metric_name)
    return name.rsplit("@", 1)[-1] if "@" in name else name


def _available_prediction_metric_groups(frame: pd.DataFrame) -> dict[str, tuple[str, ...]]:
    """Collect the ``minADE@...`` / ``minFDE@...`` metric names present in ``frame``.

    Args:
        frame: Metric rows; only the ``metric_name`` column is read.

    Returns:
        Mapping from metric family (``"minADE"``, ``"minFDE"``) to its metric
        names ordered by horizon. Families with no metrics are omitted. A frame
        without a ``metric_name`` column yields an empty mapping.
    """
    # A frame built from zero rows has no columns at all; treat it as "no metrics"
    # instead of raising KeyError.
    if "metric_name" not in frame.columns:
        return {}
    groups: dict[str, tuple[str, ...]] = {}
    metric_series = frame["metric_name"].dropna().astype(str)
    for metric_family in ("minADE", "minFDE"):
        metric_names = sorted(
            metric_series[metric_series.str.startswith(f"{metric_family}@")].unique().tolist(),
            key=_horizon_metric_sort_key,
        )
        if metric_names:
            groups[metric_family] = tuple(metric_names)
    return groups


def _build_prediction_label_profile(
    frame: pd.DataFrame,
    *,
    selected_label: str,
    metric_family: str,
    metric_names: tuple[str, ...],
    ordered_axes: list[str],
) -> go.Figure:
    """Build one line per horizon metric for a single label across releases.

    Args:
        frame: Metric rows with ``metric_name``, ``label_name``, ``value``,
            ``release_axis``, ``version``, ``date`` and ``release_name``.
        selected_label: Label whose rows are plotted.
        metric_family: Family name, used only in the figure title.
        metric_names: Metric names to include.
        ordered_axes: Release axis labels in display order.

    Returns:
        A Plotly line figure coloured by metric name.
    """
    profile_df = frame[
        (frame["metric_name"].isin(metric_names))
        & (frame["label_name"] == selected_label)
    ].dropna(subset=["value"]).copy()
    axis_order = {axis: idx for idx, axis in enumerate(ordered_axes)}
    profile_df["__axis_order"] = profile_df["release_axis"].map(axis_order).fillna(len(axis_order))
    profile_df = profile_df.sort_values(["metric_name", "__axis_order", "release_axis"])
    fig = px.line(
        profile_df,
        x="release_axis",
        y="value",
        color="metric_name",
        markers=True,
        hover_data=["version", "date", "release_name"],
        title=f"{selected_label} {metric_family} Horizon Profile",
    )
    fig.update_layout(margin=dict(l=20, r=20, t=70, b=20), legend_title_text="Horizon")
    fig.update_xaxes(categoryorder="array", categoryarray=ordered_axes, tickangle=-30, automargin=True)
    fig.update_traces(connectgaps=True)
    return fig


def _build_prediction_release_label_profile(
    frame: pd.DataFrame,
    *,
    metric_family: str,
    selected_release_axis: str,
    selected_labels: list[str],
    metric_names: tuple[str, ...],
) -> go.Figure | None:
    """Build one line per label over prediction horizons for a single release.

    Args:
        frame: Metric rows with ``release_axis``, ``label_name``,
            ``metric_name``, ``value``, ``version``, ``date`` and
            ``release_name``.
        metric_family: Family name, used in the title and the y-axis label.
        selected_release_axis: Release axis value to plot.
        selected_labels: Labels to include.
        metric_names: Metric names to include; their order fixes the x order.

    Returns:
        A Plotly line figure, or ``None`` when no row matches the selection.
    """
    release_df = frame[
        (frame["release_axis"] == selected_release_axis)
        & (frame["label_name"].isin(selected_labels))
        & (frame["metric_name"].isin(metric_names))
    ].copy()
    if release_df.empty:
        return None

    release_df["horizon"] = release_df["metric_name"].map(_horizon_metric_label)
    release_df["horizon_sort"] = release_df["metric_name"].map(lambda name: _horizon_metric_sort_key(str(name))[0])
    release_df = release_df.sort_values(["label_name", "horizon_sort"])
    fig = px.line(
        release_df,
        x="horizon",
        y="value",
        color="label_name",
        markers=True,
        category_orders={"horizon": [_horizon_metric_label(metric_name) for metric_name in metric_names]},
        hover_data=["version", "date", "release_name"],
        title=f"{metric_family} by Label and Horizon",
    )
    fig.update_layout(
        height=460,
        margin=dict(l=20, r=20, t=70, b=30),
        legend_title_text="Label",
        xaxis_title="Prediction Horizon",
        yaxis_title=f"{metric_family} (m)",
        plot_bgcolor="#ffffff",
        paper_bgcolor="#ffffff",
    )
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(gridcolor="rgba(148, 163, 184, 0.18)")
    return fig


def _attach_release_sort_columns(frame: pd.DataFrame) -> pd.DataFrame:
    """Add ``date_sort`` and ``release_display`` columns to a non-empty frame.

    The frame is modified in place and returned. Empty frames are returned
    untouched because they may not have the ``date`` / ``version`` /
    ``description`` columns at all.
    """
    if frame.empty:
        return frame
    # Dates that do not match YYYY.MM.DD become NaT rather than raising.
    frame["date_sort"] = pd.to_datetime(frame["date"], format="%Y.%m.%d", errors="coerce")
    frame["release_display"] = [
        _release_display_name(version, date, description)
        for version, date, description in zip(frame["version"], frame["date"], frame["description"])
    ]
    return frame


def _build_release_frames(groups: list[TrendReleaseGroup]) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Flatten trend release groups into release, case and metric frames.

    Args:
        groups: Release groups to flatten. The code reads ``group_key``,
            ``display_name``, ``topic_name``, ``group_kind`` and ``jobs`` from
            each group.

    Returns:
        ``(release_df, case_df, metric_df)``: one row per group, one row per
        devops case, and one row per (metric, label) value found in the "full"
        summary tables. Non-empty frames also carry ``date_sort`` and
        ``release_display`` columns; a frame with no rows has no columns.
    """
    release_rows: list[dict[str, Any]] = []
    case_rows: list[dict[str, Any]] = []
    metric_rows: list[dict[str, Any]] = []

    for group in groups:
        primary_metadata = _select_primary_metadata(group)
        version = str(primary_metadata.get("pilot_auto_version") or "")
        date = str(primary_metadata.get("date") or "")
        description = str(primary_metadata.get("description") or "")
        data_count = str(primary_metadata.get("data_count") or "")
        # Identity columns repeated on every case / metric row of this group.
        release_identity = {
            "group_key": group.group_key,
            "release_name": group.display_name,
            "version": version,
            "date": date,
            "description": description,
        }
        release_row = {
            "group_key": group.group_key,
            "release_name": group.display_name,
            "topic_name": group.topic_name,
            "group_kind": group.group_kind,
            "version": version,
            "date": date,
            "description": description,
            "data_count": data_count,
            "data_count_num": _parse_data_count(data_count),
            "full_job_id": group.jobs.get("full", {}).get("job_id"),
            "usecase_job_id": group.jobs.get("usecase", {}).get("job_id"),
            "devops_job_id": group.jobs.get("devops", {}).get("job_id"),
            "mAP": None,
            "precision": None,
            "recall": None,
            "FNR": None,
            "x_error": None,
            "y_error": None,
            "yaw_error": None,
            "speed_error": None,
            "minADE@1s": None,
            "minADE@3s": None,
            "minADE@5s": None,
            "minFDE@1s": None,
            "minFDE@3s": None,
            "minFDE@5s": None,
            "overall_pass_rate": None,
            "scenario_count": None,
            "role_count": len(group.jobs),
            "roles": ", ".join(sorted(group.jobs.keys())),
        }

        if "full" in group.jobs:
            full_summary = group.jobs["full"]["summary"]
            release_row.update(extract_performance_metrics_from_summary(full_summary))
            # `or []` also covers keys that are present but explicitly null.
            for block in full_summary.get("blocks") or []:
                block_header = str(block.get("header") or "")
                for table in block.get("tables") or []:
                    table_data = table.get("data", {})
                    if not isinstance(table_data, dict):
                        continue
                    for metric_name, labels in table_data.items():
                        if not isinstance(labels, dict):
                            continue
                        for label_name, value in labels.items():
                            metric_rows.append(
                                {
                                    **release_identity,
                                    "block_header": block_header,
                                    "metric_name": metric_name,
                                    "label_name": label_name,
                                    "value": pd.to_numeric(value, errors="coerce"),
                                }
                            )

        if "devops" in group.jobs:
            flattened = extract_devops_case_rows(group.jobs["devops"]["summary"])
            if flattened:
                total_passed = sum(int(row["passed"]) for row in flattened)
                total_count = sum(int(row["total"]) for row in flattened)
                release_row["scenario_count"] = total_count
                release_row["overall_pass_rate"] = (total_passed / total_count * 100.0) if total_count > 0 else None
                case_rows.extend({**release_identity, **row} for row in flattened)

        release_rows.append(release_row)

    release_df = _attach_release_sort_columns(pd.DataFrame(release_rows))
    case_df = _attach_release_sort_columns(pd.DataFrame(case_rows))
    metric_df = _attach_release_sort_columns(pd.DataFrame(metric_rows))
    return release_df, case_df, metric_df
+render_page_hero( + kicker="Release Analytics", + title="Trend Insights", + description="Release history and performance trends.", +) + +groups = discover_trend_release_groups() +if not groups: + st.info("No saved trend metadata was found yet. Use the release trend builder below after the three job summaries are available.") + _render_release_trend_builder() + st.stop() + +try: + release_df, case_df, metric_df = _build_release_frames(groups) +except Exception as exc: + st.error(f"Could not build trend insights: {exc}") + st.stop() + +if not release_df.empty: + release_df["topic_family"] = release_df["topic_name"].map(_topic_family) + +section_header("Release History") +release_specsheets = discover_release_specsheet_inventory(get_data_root()) +if release_specsheets: + release_specsheets = sorted( + release_specsheets, + key=lambda row: ( + pd.to_datetime(row.get("date"), format="%Y.%m.%d", errors="coerce").timestamp() + if pd.notna(pd.to_datetime(row.get("date"), format="%Y.%m.%d", errors="coerce")) + else -1.0, + str(row.get("version") or ""), + str(row.get("release") or ""), + ), + reverse=True, + ) + _render_release_library_table(release_specsheets) +else: + st.info("No imported release library was found. 
Run `python scripts/import_catalog_analyzer_releases.py --force` to import analyzer output.") + +section_header("Release Performance") +top1, top2, top3, top4, top5 = st.columns(5) +top1.metric("Performance Groups", f"{len(release_df):,}") +top2.metric("Unique Versions", f"{release_df['version'].nunique():,}" if not release_df.empty else "0") +top3.metric("Perception Performance", f"{int((release_df['topic_family'] == 'Perception Performance').sum()):,}" if not release_df.empty else "0") +top4.metric("ML Model Performance", f"{int((release_df['topic_family'] == 'ML Model Performance').sum()):,}" if not release_df.empty else "0") +top5.metric("Latest Date", release_df.sort_values("date_sort")["date"].iloc[-1] if not release_df.empty else "n/a") + +performance_table_mode = st.segmented_control( + "Table view", + options=["Dataframe", "Colored bars"], + default="Dataframe", + key="release_performance_table_mode", +) + +st.markdown("#### Perception Performance") +_release_performance_table( + release_df, + family="Perception Performance", + empty_message="No Perception Performance release rows are available.", + table_mode=performance_table_mode, +) + +st.markdown("#### ML Model Performance") +_release_performance_table( + release_df, + family="ML Model Performance", + empty_message="No ML Model Performance release rows are available.", + table_mode=performance_table_mode, +) + +section_header("Major Performance Scores") + +perf_entries = release_df[release_df["full_job_id"].notna()].sort_values( + ["date_sort", "version", "release_name"], + ascending=[True, True, True], +) +major_metric_cols = ["mAP", "precision", "recall"] +prediction_cols = [ + "minADE@1s", + "minADE@3s", + "minADE@5s", + "minFDE@1s", + "minFDE@3s", + "minFDE@5s", +] +if not perf_entries.empty and perf_entries[major_metric_cols].notna().any().any(): + latest_major_rows = ( + perf_entries.dropna(subset=major_metric_cols, how="all") + .sort_values(["date_sort", "version", "release_name"]) + 
.groupby("topic_family", dropna=False) + .tail(1) + ) + metric_card_cols = st.columns(4) + for family, card_col in zip(("Perception Performance", "ML Model Performance"), metric_card_cols[:2]): + family_row = latest_major_rows[latest_major_rows["topic_family"] == family] + if family_row.empty: + card_col.metric(f"{family} mAP", "n/a") + continue + card_col.metric( + f"{family} mAP", + f"{family_row['mAP'].iloc[-1]:.3f}" if pd.notna(family_row["mAP"].iloc[-1]) else "n/a", + ) + latest_perception_row = latest_major_rows[latest_major_rows["topic_family"] == "Perception Performance"] + latest_model_row = latest_major_rows[latest_major_rows["topic_family"] == "ML Model Performance"] + metric_card_cols[2].metric( + "Perception Recall", + f"{latest_perception_row['recall'].iloc[-1]:.3f}" + if not latest_perception_row.empty and pd.notna(latest_perception_row["recall"].iloc[-1]) + else "n/a", + ) + metric_card_cols[3].metric( + "ML Model Recall", + f"{latest_model_row['recall'].iloc[-1]:.3f}" + if not latest_model_row.empty and pd.notna(latest_model_row["recall"].iloc[-1]) + else "n/a", + ) + fig = go.Figure() + scenario_totals = ( + perf_entries[perf_entries["topic_family"] == "Perception Performance"] + .groupby("version", dropna=False)["data_count_num"] + .max() + .reindex(perf_entries["version"].drop_duplicates().tolist()) + ) + fig.add_bar( + x=scenario_totals.index.tolist(), + y=scenario_totals.tolist(), + name="Data Count", + marker_color="#f4a7a7", + opacity=0.28, + yaxis="y2", + hovertemplate="%{x}
Data Count: %{y:,}", + ) + metric_styles = { + "mAP": "#0f766e", + "precision": "#1d4ed8", + "recall": "#be123c", + } + family_dashes = { + "Perception Performance": "solid", + "ML Model Performance": "dot", + } + for family in ("Perception Performance", "ML Model Performance"): + family_df = perf_entries[perf_entries["topic_family"] == family].copy() + if family_df.empty: + continue + for metric_col in major_metric_cols: + metric_df_for_line = family_df.dropna(subset=[metric_col]) + if metric_df_for_line.empty: + continue + fig.add_trace( + go.Scatter( + x=metric_df_for_line["version"], + y=metric_df_for_line[metric_col], + name=metric_col, + legendgroup=family, + legendgrouptitle_text=family, + mode="lines+markers", + line=dict( + color=metric_styles[metric_col], + width=3, + dash=family_dashes.get(family, "solid"), + ), + marker=dict(size=7), + customdata=metric_df_for_line[["release_name", "date", "data_count", "topic_name"]].to_numpy(), + hovertemplate=( + "%{x}
" + + f"{family} {metric_col}" + + ": %{y:.3f}
Release: %{customdata[0]}
Date: %{customdata[1]}
Data Count: %{customdata[2]}
Topic: %{customdata[3]}" + ), + ) + ) + fig.update_layout( + title="Major Performance Scores", + xaxis_title="Pilot.Auto Version", + yaxis_title="Score", + yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), + height=520, + legend=dict(orientation="h", yanchor="top", y=-0.18, x=0, xanchor="left"), + legend_tracegroupgap=18, + margin=dict(l=20, r=20, t=80, b=125), + ) + st.plotly_chart(fig, use_container_width=True) +else: + st.info("No grouped major metric trend entries are available yet.") + +section_header("Prediction Trend") + +prediction_entries = perf_entries[perf_entries["topic_family"] == "Perception Performance"].copy() +prediction_entries = prediction_entries.sort_values(["date_sort", "version", "release_name"], ascending=[True, True, True]) + +if not prediction_entries.empty and prediction_entries[prediction_cols].notna().any().any(): + pred_card_col1, pred_card_col2, pred_card_col3 = st.columns(3) + latest_pred_row = prediction_entries.dropna(subset=prediction_cols, how="all").iloc[-1] + latest_minade_mean = pd.to_numeric(latest_pred_row[["minADE@1s", "minADE@3s", "minADE@5s"]], errors="coerce").mean() + latest_minfde_mean = pd.to_numeric(latest_pred_row[["minFDE@1s", "minFDE@3s", "minFDE@5s"]], errors="coerce").mean() + pred_card_col1.metric( + "Mean minADE", + f"{latest_minade_mean:.2f} m" if pd.notna(latest_minade_mean) else "n/a", + ) + pred_card_col2.metric( + "Mean minFDE", + f"{latest_minfde_mean:.2f} m" if pd.notna(latest_minfde_mean) else "n/a", + ) + pred_card_col3.metric( + "Latest Data Count", + f"{int(latest_pred_row['data_count_num']):,}" if pd.notna(latest_pred_row["data_count_num"]) else "n/a", + ) + pred_story = prediction_entries[ + ["version", "date", "description", "release_name", "data_count", "data_count_num"] + prediction_cols + ].copy() + pred_fig = go.Figure() + pred_fig.add_bar( + x=pred_story["version"], + y=pred_story["data_count_num"], + name="Data Count", + marker_color="#fbbf24", + opacity=0.20, + 
yaxis="y2", + hovertemplate="%{x}
Data Count: %{y:,}", + ) + series_specs = [ + ("minADE@1s", "#0f766e", "solid"), + ("minADE@3s", "#14b8a6", "solid"), + ("minADE@5s", "#99f6e4", "solid"), + ("minFDE@1s", "#1d4ed8", "dot"), + ("minFDE@3s", "#60a5fa", "dot"), + ("minFDE@5s", "#bfdbfe", "dot"), + ] + for metric_name, color, dash in series_specs: + metric_story = pred_story.dropna(subset=[metric_name]) + if metric_story.empty: + continue + pred_fig.add_trace( + go.Scatter( + x=metric_story["version"], + y=metric_story[metric_name], + name=metric_name, + mode="lines+markers", + line=dict(color=color, width=3 if metric_name.endswith("@3s") else 2, dash=dash), + marker=dict(size=8), + customdata=metric_story[["date", "release_name", "data_count"]].to_numpy(), + hovertemplate=( + "%{x}
" + + metric_name + + ": %{y:.2f} m
Date: %{customdata[0]}
Release: %{customdata[1]}
Data Count: %{customdata[2]}" + ), + ) + ) + pred_fig.update_layout( + title="Prediction Error Trend", + xaxis_title="Pilot.Auto Version", + yaxis_title="Prediction Error (m)", + yaxis2=dict(title="Data Count", overlaying="y", side="right", showgrid=False), + height=480, + legend=dict(orientation="h", yanchor="top", y=-0.18, x=0, xanchor="left"), + margin=dict(l=20, r=20, t=80, b=105), + plot_bgcolor="#ffffff", + paper_bgcolor="#ffffff", + ) + pred_fig.update_xaxes(showgrid=False) + pred_fig.update_yaxes(gridcolor="rgba(148, 163, 184, 0.18)") + st.plotly_chart(pred_fig, use_container_width=True) +else: + st.info("No usable grouped prediction trend values are available yet.") + +atlas_df = pd.DataFrame() +release_manifest = pd.DataFrame() +ordered_release_axes: list[str] = [] + +if not metric_df.empty: + atlas_df = metric_df[metric_df["block_header"] == "全数データセット評価"].copy() + atlas_df = atlas_df.sort_values(["date_sort", "version", "release_name"], ascending=[True, True, True]) + atlas_df["release_axis"] = atlas_df["version"].astype(str) + " | " + atlas_df["date"].astype(str) + release_manifest = ( + atlas_df[["group_key", "release_axis", "version", "date", "release_name", "release_display"]] + .drop_duplicates() + .reset_index(drop=True) + ) + ordered_release_axes = release_manifest["release_axis"].tolist() + +section_header("Pass Rate Trend") + +pass_entries = release_df[release_df["devops_job_id"].notna()].sort_values( + ["date_sort", "version", "release_name"], + ascending=[True, True, True], +) +if not pass_entries.empty: + pass_entries = pass_entries.copy() + pass_entries["pass_axis"] = pass_entries["version"].astype(str) + " | " + pass_entries["date"].astype(str) +ordered_versions = pass_entries["pass_axis"].drop_duplicates().tolist() if not pass_entries.empty else [] +overall_plot_df = pd.DataFrame() +major_summary = pd.DataFrame() +mid_summary = pd.DataFrame() + +if not pass_entries.empty and pass_entries["overall_pass_rate"].notna().any(): + 
overall_plot_df = pass_entries[ + ["pass_axis", "date", "release_name", "overall_pass_rate", "scenario_count"] + ].rename(columns={"overall_pass_rate": "pass_rate", "scenario_count": "total"}).copy() + overall_plot_df = overall_plot_df.rename(columns={"pass_axis": "version"}) + +if not case_df.empty: + case_for_pass = case_df.copy() + case_for_pass["pass_axis"] = case_for_pass["version"].astype(str) + " | " + case_for_pass["date"].astype(str) + major_summary = ( + case_for_pass.groupby(["pass_axis", "date", "release_name", "major_category"], dropna=False)[["passed", "total"]] + .sum() + .reset_index() + .rename(columns={"pass_axis": "version"}) + ) + major_summary = _with_pass_rate(major_summary) + + mid_summary = ( + case_for_pass.groupby( + ["pass_axis", "date", "release_name", "major_category", "mid_category"], + dropna=False, + )[["passed", "total"]] + .sum() + .reset_index() + .rename(columns={"pass_axis": "version"}) + ) + mid_summary = _with_pass_rate(mid_summary) + +if not overall_plot_df.empty: + st.plotly_chart( + _build_pass_combo_chart( + overall_plot_df, + title="Overall Pass Rate", + versions=ordered_versions, + series_col=None, + hover_cols=["date", "release_name"], + ), + use_container_width=True, + ) +else: + st.info("No grouped pass-rate summaries are available yet.") + +if not major_summary.empty: + st.plotly_chart( + _build_pass_combo_chart( + major_summary, + title="Major Category Pass Rate", + versions=ordered_versions, + series_col="major_category", + ), + use_container_width=True, + ) + +if not mid_summary.empty: + mid_summary_all = mid_summary.drop(columns=["major_category"], errors="ignore") + st.plotly_chart( + _build_pass_combo_chart( + mid_summary_all, + title="Mid Category Pass Rate", + versions=ordered_versions, + series_col="mid_category", + ), + use_container_width=True, + ) + +section_header("Defect Evaluation") + +if not case_df.empty and not pass_entries.empty: + defect_release_options = pass_entries["release_display"].tolist() + 
selected_defect_release = st.selectbox( + "Version", + defect_release_options, + index=len(defect_release_options) - 1, + key="defect_evaluation_release", + ) + selected_defect_row = pass_entries.iloc[defect_release_options.index(selected_defect_release)] + selected_defect_case_df = case_df[case_df["group_key"] == selected_defect_row["group_key"]].copy() + defect_category_cols = ["major_category", "mid_category", "minor_category"] + selected_major_mid = ( + selected_defect_case_df.groupby(defect_category_cols, dropna=False)[["passed", "total"]] + .sum() + .reset_index() + ) + selected_major_mid = _with_pass_rate(selected_major_mid) + if not selected_major_mid.empty: + latest_view_mode = st.radio( + "View", + ["Bars", "Treemap", "Icicle", "Sunburst"], + horizontal=True, + ) + if latest_view_mode == "Bars": + mid_level = ( + selected_defect_case_df.groupby(["major_category", "mid_category"], dropna=False)[["passed", "total"]] + .sum() + .reset_index() + ) + mid_level = _with_pass_rate(mid_level) + mid_level = mid_level.sort_values( + ["major_category", "mid_category", "pass_rate", "total"], + ascending=[True, True, False, False], + ) + ordered_mid_categories = mid_level["mid_category"].tolist() + st.plotly_chart( + _build_defect_hierarchy_bars( + mid_level, + category_cols=["major_category", "mid_category"], + color_col="major_category", + title="Major / Mid", + ), + use_container_width=True, + ) + st.plotly_chart( + _build_defect_case_bars( + selected_defect_case_df, + ordered_mid_categories=ordered_mid_categories, + ), + use_container_width=True, + ) + elif latest_view_mode == "Treemap": + latest_fig = px.treemap( + selected_major_mid, + path=defect_category_cols, + values="total", + color="pass_rate", + color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], + range_color=(0, 100), + ) + latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) + st.plotly_chart(latest_fig, use_container_width=True) + elif latest_view_mode == "Icicle": + latest_fig = 
px.icicle( + selected_major_mid, + path=defect_category_cols, + values="total", + color="pass_rate", + color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], + range_color=(0, 100), + ) + latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) + st.plotly_chart(latest_fig, use_container_width=True) + else: + latest_fig = px.sunburst( + selected_major_mid, + path=defect_category_cols, + values="total", + color="pass_rate", + color_continuous_scale=["#7f1d1d", "#fef3c7", "#166534"], + range_color=(0, 100), + ) + latest_fig.update_layout(margin=dict(l=20, r=20, t=70, b=20)) + st.plotly_chart(latest_fig, use_container_width=True) + + case_pass_rate = selected_defect_case_df.copy() + case_pass_rate["case"] = case_pass_rate["minor_category"].fillna(case_pass_rate["case_name"]) + case_pass_rate = case_pass_rate.sort_values(["pass_rate", "total"], ascending=[True, False]) + with st.expander("Case Pass Rates", expanded=False): + st.dataframe( + case_pass_rate[ + ["major_category", "mid_category", "case", "pass_rate", "passed", "total"] + ], + use_container_width=True, + hide_index=True, + column_config={ + "pass_rate": st.column_config.NumberColumn("pass_rate", format="%.1f%%"), + }, + ) + else: + st.info("No defect evaluation hierarchy is available yet.") +else: + st.info("No defect evaluation summaries are available yet.") + +if not atlas_df.empty: + release_options = release_manifest["release_axis"].tolist() + section_header("Release Details") + selected_detail_release = st.selectbox( + "Version", + release_options, + index=len(release_options) - 1, + key="deep_dive_release_detail", + ) + horizon_metric_groups = _available_prediction_metric_groups(atlas_df) + available_horizon_families = [metric_family for metric_family in ("minADE", "minFDE") if metric_family in horizon_metric_groups] + horizon_labels = sorted( + atlas_df[ + atlas_df["metric_name"].isin( + [metric_name for metric_names in horizon_metric_groups.values() for metric_name in metric_names] + ) + 
]["label_name"] + .dropna() + .astype(str) + .unique() + .tolist() + ) + + selected_atlas_group_key = release_manifest.loc[ + release_manifest["release_axis"] == selected_detail_release, + "group_key", + ].iloc[0] + latest_matrix = atlas_df[atlas_df["group_key"] == selected_atlas_group_key].pivot_table( + index="metric_name", + columns="label_name", + values="value", + aggfunc="first", + ).dropna(how="all") + if not latest_matrix.empty: + latest_min = latest_matrix.min(axis=1) + latest_range = (latest_matrix.max(axis=1) - latest_min).replace(0, 1) + latest_norm = latest_matrix.sub(latest_min, axis=0).div(latest_range, axis=0) + latest_atlas_fig = px.imshow( + latest_norm, + aspect="auto", + color_continuous_scale=["#f8fafc", "#8dd3c7", "#0f766e"], + text_auto=".2f", + ) + latest_atlas_fig.update_traces( + text=latest_matrix.round(2).astype(str), + hovertemplate="Metric: %{y}
Label: %{x}
Value: %{text}", + ) + latest_atlas_fig.update_layout( + title="Metric Atlas", + margin=dict(l=20, r=20, t=70, b=20), + coloraxis_colorbar=dict(title="Relative"), + ) + latest_atlas_fig.update_xaxes(automargin=True) + latest_atlas_fig.update_yaxes(automargin=True) + st.plotly_chart(latest_atlas_fig, use_container_width=True) + else: + st.info("No metric atlas is available for the selected release yet.") + + if available_horizon_families and horizon_labels: + release_detail_cols = st.columns(len(available_horizon_families)) + for col, metric_family in zip(release_detail_cols, available_horizon_families): + metric_names = horizon_metric_groups[metric_family] + family_df = atlas_df[atlas_df["metric_name"].isin(metric_names)].copy() + release_fig = _build_prediction_release_label_profile( + family_df, + metric_family=metric_family, + selected_release_axis=selected_detail_release, + selected_labels=horizon_labels, + metric_names=metric_names, + ) + with col: + if release_fig is not None: + st.plotly_chart(release_fig, use_container_width=True) + else: + st.info(f"No {metric_family} horizon values are available for the selected release.") + + section_header("Trend Details") + if available_horizon_families and horizon_labels: + selected_horizon_label = st.selectbox( + "Label Trend Focus", + horizon_labels, + key="prediction_horizon_label_focus", + ) + trend_profile_cols = st.columns(len(available_horizon_families)) + for col, metric_family in zip(trend_profile_cols, available_horizon_families): + metric_names = horizon_metric_groups[metric_family] + family_df = atlas_df[atlas_df["metric_name"].isin(metric_names)].copy() + profile_fig = _build_prediction_label_profile( + family_df, + selected_label=selected_horizon_label, + metric_family=metric_family, + metric_names=metric_names, + ordered_axes=ordered_release_axes, + ) + with col: + st.plotly_chart(profile_fig, use_container_width=True) + else: + st.info("No minADE/minFDE horizon trend data is available yet.") + + 
trend_mode = st.radio( + "Trend View", + ["Timeline Heatmap", "Label Trend Lines"], + horizontal=True, + key="detailed_metric_trend_view", + ) + + metric_options = sorted(atlas_df["metric_name"].dropna().unique().tolist()) + selected_metric = st.selectbox("Metric", metric_options, key="detailed_metric_trend_metric") + + metric_trend_df = atlas_df[atlas_df["metric_name"] == selected_metric].copy() + if not metric_trend_df.empty: + if trend_mode == "Timeline Heatmap": + explorer_fig = _build_metric_timeline_heatmap( + metric_trend_df, + value_col="value", + title=f"{selected_metric} Timeline Heatmap by Label", + color_title=selected_metric, + ) + else: + explorer_fig = _build_metric_label_lines( + metric_trend_df, + title=f"{selected_metric} Label Trend Lines", + ordered_axes=ordered_release_axes, + ) + st.plotly_chart(explorer_fig, use_container_width=True) + else: + st.info("No detailed trend data is available for the selected metric yet.") +elif not metric_df.empty: + st.info("No full-dataset metric atlas data is available yet.") + +if not case_df.empty: + with st.expander("Case Explorer", expanded=False): + filter_col1, filter_col2, filter_col3, filter_col4 = st.columns(4) + with filter_col1: + selected_major = st.selectbox("Major Category", ["All"] + sorted(case_df["major_category"].dropna().unique().tolist())) + case_filtered = case_df.copy() + if selected_major != "All": + case_filtered = case_filtered[case_filtered["major_category"] == selected_major] + with filter_col2: + selected_mid = st.selectbox("Mid Category", ["All"] + sorted(case_filtered["mid_category"].dropna().unique().tolist())) + if selected_mid != "All": + case_filtered = case_filtered[case_filtered["mid_category"] == selected_mid] + with filter_col3: + selected_minor = st.selectbox("Minor Category", ["All"] + sorted(case_filtered["minor_category"].dropna().unique().tolist())) + if selected_minor != "All": + case_filtered = case_filtered[case_filtered["minor_category"] == selected_minor] + with 
filter_col4: + selected_case = st.selectbox("Case", ["All"] + sorted(case_filtered["case_name"].dropna().unique().tolist())) + if selected_case != "All": + case_filtered = case_filtered[case_filtered["case_name"] == selected_case] + + st.dataframe( + case_filtered.sort_values(["date_sort", "version", "case_name"]).drop(columns=["date_sort"], errors="ignore"), + use_container_width=True, + hide_index=True, + ) + +with st.expander("Grouped Raw Browser", expanded=False): + selection_df = release_df.sort_values( + ["date_sort", "version", "release_name"], + ascending=[False, False, False], + ).reset_index(drop=True) + selection_labels = [ + f"{row.release_display} | roles: {row.roles}" + for row in selection_df.itertuples() + ] + selected_label = st.selectbox("Release Group", selection_labels) + selected_release = selection_df.iloc[selection_labels.index(selected_label)] + selected_group = next(group for group in groups if group.group_key == selected_release["group_key"]) + + group_manifest = { + "display_name": selected_group.display_name, + "topic_name": selected_group.topic_name, + "group_kind": selected_group.group_kind, + "base_dir": str(selected_group.base_dir), + "jobs": { + role: { + "job_id": payload["job_id"], + "metadata_path": str(payload["metadata_path"]), + "summary_path": str(payload["summary_path"]), + } + for role, payload in selected_group.jobs.items() + }, + } + + detail_col1, detail_col2 = st.columns([0.9, 1.1]) + with detail_col1: + st.markdown("**Release Group Manifest**") + st.code(json.dumps(group_manifest, ensure_ascii=False, indent=2), language="json") + role_choice = st.selectbox("Child Role", sorted(selected_group.jobs.keys())) + + with detail_col2: + st.markdown("**Selected Child Summary JSON**") + st.code( + json.dumps(selected_group.jobs[role_choice]["summary"], ensure_ascii=False, indent=2)[:30000], + language="json", + ) + +_render_release_trend_builder() + +if release_specsheets: + with st.expander("Debug release inventory paths", 
expanded=False): + st.dataframe( + pd.DataFrame(_release_inventory_debug_rows(release_specsheets)), + width="stretch", + hide_index=True, + ) diff --git a/evaluation_dashboard_app/pages/1_TP_Summary.py b/evaluation_dashboard_app/pages/1_TP_Summary.py index c08e7a1..1425fca 100644 --- a/evaluation_dashboard_app/pages/1_TP_Summary.py +++ b/evaluation_dashboard_app/pages/1_TP_Summary.py @@ -2,15 +2,20 @@ import plotly.express as px import pandas as pd from lib.path_utils import path_display +from lib.overview_url_hydrate import try_hydrate_session_from_overview_query_params from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero, section_header from lib.summary_compare import build_summary_delta st.set_page_config(layout="wide", page_title="TP Summary", page_icon="📈", initial_sidebar_state="expanded") +try_hydrate_session_from_overview_query_params() inject_app_page_styles() # ========== Safety Check ========== if "runA" not in st.session_state: - st.warning("Please load data from the Overview page first.") + st.warning( + "Please load data from the Overview page first. " + "If you already did, open Overview once so the URL includes `run_a=...`, then return (multiple Streamlit replicas)." 
+ ) st.stop() mode = st.session_state.get("mode", "Single Run") @@ -71,6 +76,18 @@ mode=mode, ) + +def _apply_compact_chart_layout(fig, *, height: int = 300) -> None: + """Keep TP Summary charts visually lighter and more compact.""" + fig.update_layout( + template="plotly_white", + height=height, + margin=dict(t=48, b=40, l=48, r=18), + paper_bgcolor="rgba(248,250,252,0.9)", + plot_bgcolor="rgba(255,255,255,0.95)", + font=dict(family="system-ui, sans-serif", size=12, color="#334155"), + ) + # ========== View Selector ========== st.sidebar.markdown("##### Scope") if mode == "Compare Mode" and all_runs and run_labels and delta_by_label: @@ -120,6 +137,18 @@ if tp_col not in df_active.columns: st.warning(f"Missing required column: {tp_col}") st.stop() + +if df_active.empty: + if use_delta: + _keys = "id and perception_label" if "perception_label" in df_active.columns else "id" + st.warning( + f"No delta rows: baseline and candidate share no common Summary keys ({_keys}). " + "Pick Baseline or Candidate in the sidebar, or load runs with overlapping rows." + ) + else: + st.warning("The active Summary has no rows for this view.") + st.stop() + tp_values = df_active[tp_col] tp_min_val = float(tp_values.min()) tp_max_val = float(tp_values.max()) @@ -153,7 +182,7 @@ # ========== Data Filtering ========== df_f = df_active[(df_active[tp_col] >= tp_min) & (df_active[tp_col] <= tp_max)].copy() -if clip_vel: +if clip_vel and not df_f.empty: vx_col = "vx_delta" if use_delta else "vx" vy_col = "vy_delta" if use_delta else "vy" for c in (vx_col, vy_col): @@ -194,7 +223,8 @@ section_header("Position RMS (X vs Y)", "Lateral vs longitudinal RMS error; color encodes TP or ΔTP.") # Always compare the two sources side by side (before and after/delta) if use_delta: - # Show both reference and target RMS comparisons for X and Y, as well as their deltas + # Show both reference and target RMS comparisons in a tighter 2-up row. 
+ rms_left, rms_right = st.columns(2) fig_rms_x_compare = px.scatter( df_f, x="xrms_B", @@ -208,11 +238,14 @@ "xrms_delta": "Δ X RMS", "yrms_delta": "Δ Y RMS", }, - title=f"Scatter: X RMS ({cand}) vs X RMS (A)", + title=f"X RMS · {cand} vs A", color_continuous_scale="Viridis", ) - fig_rms_x_compare.update_traces(marker=dict(size=8, opacity=0.6)) - st.plotly_chart(fig_rms_x_compare, width="stretch") + fig_rms_x_compare.update_traces(marker=dict(size=7, opacity=0.58)) + _apply_compact_chart_layout(fig_rms_x_compare, height=290) + with rms_left: + st.plotly_chart(fig_rms_x_compare, width="stretch") + fig_rms_y_compare = px.scatter( df_f, x="yrms_B", @@ -226,11 +259,13 @@ "xrms_delta": "Δ X RMS", "yrms_delta": "Δ Y RMS", }, - title=f"Scatter: Y RMS ({cand}) vs Y RMS (A)", + title=f"Y RMS · {cand} vs A", color_continuous_scale="Viridis", ) - fig_rms_y_compare.update_traces(marker=dict(size=8, opacity=0.6)) - st.plotly_chart(fig_rms_y_compare, width="stretch") + fig_rms_y_compare.update_traces(marker=dict(size=7, opacity=0.58)) + _apply_compact_chart_layout(fig_rms_y_compare, height=290) + with rms_right: + st.plotly_chart(fig_rms_y_compare, width="stretch") else: # Just show the submission's RMS (x/y) for standard analysis fig_rms = px.scatter( @@ -246,13 +281,14 @@ }, color_continuous_scale="Viridis", ) - fig_rms.update_traces(marker=dict(size=8, opacity=0.7)) + fig_rms.update_traces(marker=dict(size=8, opacity=0.68)) + _apply_compact_chart_layout(fig_rms, height=320) st.plotly_chart(fig_rms, width="stretch") with col2: section_header("Velocity (vx vs vy)", "Planar velocity colored by TP or ΔTP.") - def plot_velocity(df, vx, vy, vx_label, vy_label): + def plot_velocity(df, vx, vy, vx_label, vy_label, *, title: str): fig = px.scatter( df, x=vx, @@ -265,18 +301,32 @@ def plot_velocity(df, vx, vy, vx_label, vy_label): tp_col: "TP", }, color_continuous_scale="Plasma", - title=f"{vx_label} vs {vy_label}", + title=title, ) - st.plotly_chart(fig, width="stretch") + 
fig.update_traces(marker=dict(size=7, opacity=0.58)) + _apply_compact_chart_layout(fig, height=290 if use_delta else 320) + return fig if use_delta: - plot_velocity(df_f, "vx", "vy", "Vx (A)", "Vy (A)") - plot_velocity(df_f, "vx_B", "vy_B", f"Vx ({cand})", f"Vy ({cand})") + vel_left, vel_right = st.columns(2) + with vel_left: + st.plotly_chart( + plot_velocity(df_f, "vx", "vy", "Vx (A)", "Vy (A)", title="Velocity · A"), + width="stretch", + ) + with vel_right: + st.plotly_chart( + plot_velocity(df_f, "vx_B", "vy_B", f"Vx ({cand})", f"Vy ({cand})", title=f"Velocity · {cand}"), + width="stretch", + ) else: - plot_velocity(df_f, "vx", "vy", "Vx", "Vy") + st.plotly_chart( + plot_velocity(df_f, "vx", "vy", "Vx", "Vy", title="Velocity"), + width="stretch", + ) # ========== Metric Distribution ========== -section_header("Metric distribution", "Histogram + marginal box for any Summary column or delta column.") +section_header("Metric distribution", "Compact secondary views for a selected Summary metric.") metrics = ["xstd", "ystd", "xrms", "yrms", "vx", "vy", "TP"] metrics_delta = [f"{m}_delta" for m in metrics] metric_options = metrics_delta if use_delta else metrics @@ -287,46 +337,42 @@ def plot_velocity(df, vx, vy, vx_label, vy_label): default_index = 0 metric = st.selectbox("Select metric", metric_options, index=default_index) -# Show a simple, single-color (monochrome) distribution for clarity +dist_left, dist_right = st.columns(2) + fig_hist = px.histogram( df_f, x=metric, - nbins=40, + nbins=36, color_discrete_sequence=["#0d9488"], marginal="box", opacity=0.88, + title=f"{metric} distribution", ) fig_hist.update_layout( - template="plotly_white", showlegend=False, bargap=0.04, xaxis_title=metric, yaxis_title="Count", - paper_bgcolor="rgba(248,250,252,0.9)", - plot_bgcolor="rgba(255,255,255,0.95)", - font=dict(family="system-ui, sans-serif", size=12, color="#334155"), - margin=dict(t=36, b=48, l=56, r=28), ) -st.plotly_chart(fig_hist, width="stretch") 
+_apply_compact_chart_layout(fig_hist, height=280) +with dist_left: + st.plotly_chart(fig_hist, width="stretch") -section_header("Density (violin)", "Shape of the selected metric including outliers.") fig_density = px.violin( df_f, y=metric, box=True, - points="all", + points="outliers", color_discrete_sequence=["#312e81"], + title=f"{metric} density", ) fig_density.update_layout( - template="plotly_white", yaxis_title=metric, showlegend=False, - paper_bgcolor="rgba(248,250,252,0.9)", - plot_bgcolor="rgba(255,255,255,0.95)", - font=dict(family="system-ui, sans-serif", size=12, color="#334155"), - margin=dict(t=36, b=48, l=56, r=28), ) -st.plotly_chart(fig_density, width="stretch") +_apply_compact_chart_layout(fig_density, height=280) +with dist_right: + st.plotly_chart(fig_density, width="stretch") # ========== Scenario-level Delta Analysis (Compare Mode) ========== df_cmp = df_active if use_delta else None diff --git a/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py b/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py index 96d13e3..5229f46 100644 --- a/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py +++ b/evaluation_dashboard_app/pages/2_Criteria_Based_Score.py @@ -4,6 +4,7 @@ import plotly.express as px import plotly.graph_objects as go from lib.path_utils import path_display +from lib.overview_url_hydrate import try_hydrate_session_from_overview_query_params from lib.page_chrome import ( inject_app_page_styles, render_loaded_data_section, @@ -24,7 +25,15 @@ export_gate_result, failing_scenarios_table, gate_summary, - infer_criteria_count, +) +from lib.score_schema import ( + SCORE_BLOCK_SIZE, + SCORE_NUM_COLS, + SCORE_VIEW_METRIC_COLS, + build_score_view, + infer_score_criteria_count, + score_base_cols, + score_identity_cols, ) st.set_page_config( @@ -33,7 +42,7 @@ page_icon="📊", initial_sidebar_state="expanded", ) - +try_hydrate_session_from_overview_query_params() # Plotly theme (multi-run palette aligned with Overview / run cards) 
_COMPARE_RUN_COLORS = ["#312e81", "#0f766e", "#e86a33", "#6b8e23", "#9b59b6", "#1abc9c"] @@ -111,7 +120,15 @@ def _filter_df_view_by_perception_labels( allowed = set(s["id"].unique()) if not allowed: return df_view.iloc[0:0].copy() - return df_view.loc[df_view["Scenario"].astype(str).isin(allowed)].copy() + + scenario_key = df_view["Scenario"].astype(str) + mask = scenario_key.isin(allowed) + if "Dataset" in df_view.columns: + # Older generated Score.csv files stored the final scenario suffix in Dataset, + # while Summary.csv kept the full id. Keep matching those files too. + composite_key = scenario_key + "_" + df_view["Dataset"].astype(str) + mask = mask | composite_key.isin(allowed) + return df_view.loc[mask].copy() def _filter_df_view_by_scenarios(df_view: pd.DataFrame, selected_scenarios: list) -> pd.DataFrame: @@ -143,7 +160,10 @@ def _apply_gate_data_filters( # Safety check # ========================= if "runA" not in st.session_state: - st.warning("Please load data from the Overview page first.") + st.warning( + "Please load data from the Overview page first. " + "If you already did, open Overview once so the URL includes `run_a=...`, then return (multiple Streamlit replicas)." 
+ ) st.stop() mode = st.session_state.get("mode", "Single Run") @@ -189,54 +209,14 @@ def _apply_gate_data_filters( # Constants # ========================= -BASE_COLS = ["Scenario", "Option", "GT_OBJ"] - -CRITERIA_COLS = [ - "distance", - "nm", - "tp_tn", - "add", - "ail", - "uil", - "pfn_pfp", - "uuid_num", - "pass_rate", - "max_dist_thresh", - "obj_cnts", -] - -BLOCK_COLS = [ - "distance", - "nm", - "tp_tn", - "add", - "ail", - "uil", - "pfn_pfp", - "uuid_num", - "pass_rate", - "max_dist_thresh", - "obj_cnts", -] - -BLOCK_SIZE = len(CRITERIA_COLS) - -NUM_COLS = [ - "distance", - "nm", - "tp_tn", - "add", - "ail", - "uil", - "pfn_pfp", - "uuid_num", - "pass_rate", - "max_dist_thresh", -] - -_criteria_n_a = infer_criteria_count(df_raw_A, BLOCK_SIZE) +BASE_COLS = score_base_cols(df_raw_A) +CRITERIA_COLS = SCORE_VIEW_METRIC_COLS +BLOCK_SIZE = SCORE_BLOCK_SIZE +NUM_COLS = SCORE_NUM_COLS + +_criteria_n_a = infer_score_criteria_count(df_raw_A) if mode == "Compare Mode" and compare_runs: - CRITERIA_COUNT = min(infer_criteria_count(r["score"], BLOCK_SIZE) for r in compare_runs) + CRITERIA_COUNT = min(infer_score_criteria_count(r["score"]) for r in compare_runs) else: CRITERIA_COUNT = _criteria_n_a @@ -251,19 +231,16 @@ def _apply_gate_data_filters( def build_view(df_raw, criteria_idx): - start = 3 + criteria_idx * BLOCK_SIZE - end = start + BLOCK_SIZE - - df_view = df_raw.iloc[:, :3].copy() - df_view.columns = BASE_COLS + return build_score_view(df_raw, criteria_idx) - block = df_raw.iloc[:, start:end].copy() - block.columns = BLOCK_COLS - df_view = pd.concat([df_view, block], axis=1) - for c in NUM_COLS: - df_view[c] = pd.to_numeric(df_view[c], errors="coerce") - return df_view +def _add_scenario_display(df: pd.DataFrame) -> pd.DataFrame: + d = df.copy() + if "Dataset" in d.columns: + d["ScenarioDisplay"] = d["Scenario"].astype(str) + " [" + d["Dataset"].astype(str) + "]" + else: + d["ScenarioDisplay"] = d["Scenario"].astype(str) + return d st.sidebar.divider() @@ 
-453,13 +430,14 @@ def _gate_compare_overlap_stats(result_a: pd.DataFrame, result_b: pd.DataFrame) """Classify scenarios on inner join (same Scenario id in both gate tables).""" if result_a is None or result_b is None or result_a.empty or result_b.empty: return None - a = result_a[["Scenario", "scenario_pass"]].copy() - b = result_b[["Scenario", "scenario_pass"]].copy() + key_cols = [c for c in score_identity_cols(result_a) if c in result_b.columns] + a = result_a[key_cols + ["scenario_pass"]].copy() + b = result_b[key_cols + ["scenario_pass"]].copy() a["pass_a"] = a["scenario_pass"].map(bool) b["pass_b"] = b["scenario_pass"].map(bool) outer = a.drop(columns=["scenario_pass"]).merge( b.drop(columns=["scenario_pass"]), - on="Scenario", + on=key_cols, how="outer", indicator=True, ) @@ -500,7 +478,10 @@ def _overlap_scenario_lists(merged: pd.DataFrame) -> dict[str, list[str]]: "a_fail_b_pass": [], "a_pass_b_fail": [], } - scen = merged["Scenario"].astype(str) + if "Dataset" in merged.columns: + scen = merged["Scenario"].astype(str) + " [" + merged["Dataset"].astype(str) + "]" + else: + scen = merged["Scenario"].astype(str) pa = merged["pass_a"].map(bool) pb = merged["pass_b"].map(bool) return { @@ -1085,14 +1066,16 @@ def _render_absolute_gates_section( "Per-scenario pass rate", "Scenarios present in every run (inner join) — filter to focus on regressions or wins.", ) + scenario_key_cols = score_identity_cols(df_views[0]) merges = [] for i, lbl in enumerate(cl): - g = df_views[i].groupby("Scenario", as_index=False)["pass_rate"].mean() + g = df_views[i].groupby(scenario_key_cols, as_index=False)["pass_rate"].mean() g = g.rename(columns={"pass_rate": f"pr_{lbl}"}) merges.append(g) per_scenario = merges[0] for g in merges[1:]: - per_scenario = per_scenario.merge(g, on="Scenario", how="inner") + per_scenario = per_scenario.merge(g, on=scenario_key_cols, how="inner") + per_scenario = _add_scenario_display(per_scenario) pr_base = f"pr_{cl[0]}" delta_col = 
f"delta_{focus_cand}" for lbl in cand_only: @@ -1134,7 +1117,7 @@ def _render_absolute_gates_section( elif filter_method == "Custom contains string": search = st.text_input("Show scenarios with name containing (case-insensitive):", "") per_scenario_vis = ( - per_scenario[per_scenario["Scenario"].str.contains(search, case=False, na=False)] + per_scenario[per_scenario["ScenarioDisplay"].str.contains(search, case=False, na=False)] if search else per_scenario ) @@ -1145,7 +1128,7 @@ def _render_absolute_gates_section( col_to_run = {f"pr_{lbl}": run_names[i] for i, lbl in enumerate(cl)} per_scenario_vis_long = pd.melt( per_scenario_vis, - id_vars=["Scenario"], + id_vars=scenario_key_cols + ["ScenarioDisplay"], value_vars=pr_cols_melt, var_name="_k", value_name="pass_rate", @@ -1155,7 +1138,7 @@ def _render_absolute_gates_section( fig = px.bar( per_scenario_vis_long, - x="Scenario", + x="ScenarioDisplay", y="pass_rate", color="Run", color_discrete_map=_px_map, @@ -1171,7 +1154,7 @@ def _render_absolute_gates_section( ) fig2 = px.bar( per_scenario_vis.reindex(per_scenario_vis[delta_col].abs().sort_values(ascending=False).index), - x="Scenario", + x="ScenarioDisplay", y=delta_col, color=delta_col, color_continuous_scale="RdYlGn", @@ -1180,7 +1163,7 @@ def _render_absolute_gates_section( _plotly_apply_theme(fig2, "Pass rate delta by scenario") st.plotly_chart(fig2, width="stretch") - table_cols = ["Scenario"] + pr_cols_melt + [f"delta_{lbl}" for lbl in cand_only] + table_cols = scenario_key_cols + pr_cols_melt + [f"delta_{lbl}" for lbl in cand_only] table_cols = [c for c in table_cols if c in per_scenario_vis.columns] with st.expander("Show Table: Per Scenario Pass Rates and Deltas"): st.dataframe(per_scenario_vis[table_cols], width="stretch") @@ -1195,7 +1178,7 @@ def _render_absolute_gates_section( per_scenario_vis, x=pr_base, y=f"pr_{focus_cand}", - text="Scenario", + text="ScenarioDisplay", labels={ pr_base: f"Baseline ({cl[0]}) Pass Rate", f"pr_{focus_cand}": 
f"Candidate ({focus_cand}) Pass Rate", @@ -1376,7 +1359,8 @@ def _render_absolute_gates_section( st.plotly_chart(fig, width="stretch") section_header("Scenario leaderboard", "Mean pass rate per scenario — tune N and sort direction.") - scenario_metric = df_view.groupby("Scenario", as_index=False)["pass_rate"].mean() + scenario_key_cols = score_identity_cols(df_view) + scenario_metric = df_view.groupby(scenario_key_cols, as_index=False)["pass_rate"].mean() top_n = st.number_input("Top N scenarios", min_value=5, max_value=100, value=20, key="single_top_n") sort_order = st.radio("Order", ["Highest first", "Lowest first"], horizontal=True, key="single_scen_order") scenario_metric = scenario_metric.sort_values( diff --git a/evaluation_dashboard_app/pages/3_Detection_Stats.py b/evaluation_dashboard_app/pages/3_Detection_Stats.py index c10fcc7..480203a 100644 --- a/evaluation_dashboard_app/pages/3_Detection_Stats.py +++ b/evaluation_dashboard_app/pages/3_Detection_Stats.py @@ -1,5 +1,6 @@ import html from contextlib import contextmanager +import hashlib import duckdb import streamlit as st @@ -12,6 +13,15 @@ from typing import Optional, List, Tuple from lib.path_utils import path_display +from lib.detection_stats_debug import ( + ds_debug_init_session_state, + ds_debug_log_exception, + ds_debug_log_memory, + ds_debug_render_expander, + ds_dlog, + ds_dtimer, +) +from lib.overview_url_hydrate import try_hydrate_session_from_overview_query_params from lib.parquet_schema import schema_flags from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero from lib.ui.detection_stats import ( @@ -73,6 +83,16 @@ ) +def _banner_html_with_note(note: str) -> str: + base = detection_stats_page_loading_banner_markup() + if not note: + return base + return base.replace( + 'Hang tight — large Parquet files can take a moment.', + f'Hang tight — large Parquet files can take a moment.
{html.escape(note)}
', + ) + + def apply_chart_theme(fig, **overrides): """Apply unified theme to a Plotly figure; overrides (e.g. height, margin) take precedence.""" layout_update = {**PLOTLY_LAYOUT_THEME, **overrides} @@ -281,11 +301,18 @@ def _scalar_metric_spider_compare( initial_sidebar_state="expanded", ) +try_hydrate_session_from_overview_query_params() +ds_debug_init_session_state(st.session_state) + # ============================= # Session state from Overview (mode, run paths) # ============================= if "runA" not in st.session_state: - st.warning("Please load data from the **Overview** page first (select mode and run(s)).") + st.warning( + "Please load data from the **Overview** page first (select mode and run(s)). " + "If you already did, open **Overview** once so the URL includes `run_a=...` (share link), then return — " + "or hard-refresh. With multiple Streamlit replicas, the server-side session may not follow until the URL is synced." + ) st.stop() inject_app_page_styles() @@ -316,16 +343,24 @@ def list_parquets_in_run(run_path) -> List[str]: return sorted([str(f.resolve()) for f in p.glob("*.parquet")]) # ============================= -# DuckDB Connection +# DuckDB Connection (one in-memory DB per Streamlit browser session) # ============================= -_duckdb_connection: Optional[duckdb.DuckDBPyConnection] = None - def get_duckdb_connection() -> duckdb.DuckDBPyConnection: - """Return a shared DuckDB connection for all queries.""" - global _duckdb_connection - if _duckdb_connection is None: - _duckdb_connection = duckdb.connect() - return _duckdb_connection + """Return a DuckDB connection scoped to this Streamlit session.""" + if "_ds_duckdb" not in st.session_state: + st.session_state["_ds_duckdb"] = duckdb.connect() + return st.session_state["_ds_duckdb"] + + +def _parquet_selection_fingerprint(paths: List[str]) -> Tuple[Tuple[str, float], ...]: + """Path + mtime per file so filter-only reruns skip rebuilding views when data is unchanged.""" + fp: 
List[Tuple[str, float]] = [] + for p in paths: + try: + fp.append((p, os.path.getmtime(p))) + except OSError: + fp.append((p, 0.0)) + return tuple(fp) # ============================= # Helper Functions @@ -359,12 +394,27 @@ def list_values(con, pq: str, expr: str, where: Optional[str] = None) -> List: return [] return df_.iloc[:, 0].dropna().tolist() + +def _is_detection_stats_eval_flat_cache(path: str) -> bool: + p = Path(path) + return p.suffix == ".parquet" and p.name.endswith("_eval_flat.parquet") + + def create_view_eval_flat(con, target_file: str, view_name: str = "view_eval_flat"): """Create view_eval_flat with distance bins.""" - query = f""" - CREATE OR REPLACE VIEW {view_name} AS + safe_target = target_file.replace("'", "''") + if _is_detection_stats_eval_flat_cache(target_file): + query = f"CREATE OR REPLACE VIEW {view_name} AS SELECT * FROM parquet_scan('{safe_target}')" + else: + query = f"CREATE OR REPLACE VIEW {view_name} AS {eval_flat_select_sql(target_file)}" + con.execute(query) + + +def eval_flat_select_sql(target_file: str) -> str: + safe_target = target_file.replace("'", "''") + return f""" WITH src AS ( - SELECT * FROM parquet_scan('{target_file}') + SELECT * FROM parquet_scan('{safe_target}') UNION BY NAME SELECT CAST(NULL AS VARCHAR) AS visibility, CAST(NULL AS VARCHAR) AS suite_name, @@ -411,14 +461,46 @@ def create_view_eval_flat(con, target_file: str, view_name: str = "view_eval_fla JOIN bins b ON bse.dist_h >= b.bin_start AND bse.dist_h < b.bin_end """ - con.execute(query) -def create_view_tpr_fpr(con, view_name: str = "view_tpr_fpr_by_class_dist_topic", source_eval_flat: str = "view_eval_flat"): - """Create TPR/FPR view. 
source_eval_flat is the name of the eval_flat view to read from.""" - query = f""" - CREATE OR REPLACE VIEW {view_name} AS - WITH stats AS ( - SELECT + +def _ds_cache_dir_for_run(run_path: Path) -> Path: + return run_path / ".dashboard_cache" / "detection_stats_cache" + + +def _ds_cache_key_for_source(source_path: str) -> str: + return hashlib.sha1(source_path.encode("utf-8")).hexdigest()[:12] + + +def _ds_cache_path_for_source(run_path: Path, source_path: str) -> Path: + src = Path(source_path) + return _ds_cache_dir_for_run(run_path) / f"{src.stem}_{_ds_cache_key_for_source(source_path)}_eval_flat.parquet" + + +def _ensure_detection_stats_eval_flat_cache( + con: duckdb.DuckDBPyConnection, + *, + run_path: Path, + source_path: str, +) -> tuple[str, bool]: + """ + Ensure a materialized eval_flat parquet exists for this source parquet. + Returns (cached_parquet_path, rebuilt_flag). + """ + cache_dir = _ds_cache_dir_for_run(run_path) + cache_dir.mkdir(parents=True, exist_ok=True) + cache_path = _ds_cache_path_for_source(run_path, source_path) + source_stat = Path(source_path).stat() + needs_rebuild = ( + not cache_path.exists() + or cache_path.stat().st_mtime < source_stat.st_mtime + ) + if needs_rebuild: + safe_out = str(cache_path).replace("'", "''") + con.execute(f"COPY ({eval_flat_select_sql(source_path)}) TO '{safe_out}' (FORMAT PARQUET)") + return str(cache_path), needs_rebuild + +# Per-(dataset, topic, label, bin, visibility, suite) aggregates — shared by distance-bin rate queries. 
+_TPR_FPR_STATS_SELECT = """SELECT t4dataset_id, topic_name, label, @@ -429,55 +511,110 @@ def create_view_tpr_fpr(con, view_name: str = "view_tpr_fpr_by_class_dist_topic" COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) AS gt_total, COUNT(*) FILTER (WHERE source='GT' AND status='TP') AS tp_gt, COUNT(*) FILTER (WHERE source='EST' AND status IN ('TP','FP')) AS est_total, - COUNT(*) FILTER (WHERE source='EST' AND status='FP') AS fp_est - FROM {source_eval_flat} - GROUP BY - t4dataset_id, topic_name, label, distance_bin, bin_idx, + COUNT(*) FILTER (WHERE source='EST' AND status='FP') AS fp_est""" + +_TPR_FPR_STATS_GROUP_BY = """t4dataset_id, topic_name, label, distance_bin, bin_idx, coalesce(try(CAST(visibility AS VARCHAR)), 'not available'), - coalesce(try(CAST(suite_name AS VARCHAR)), '') - ) - SELECT - *, - CASE WHEN gt_total > 0 THEN CAST(tp_gt AS DOUBLE) / gt_total ELSE NULL END AS tpr, - CASE WHEN est_total > 0 THEN CAST(fp_est AS DOUBLE) / est_total ELSE NULL END AS fpr - FROM stats + coalesce(try(CAST(suite_name AS VARCHAR)), '')""" + + +def sql_distance_bin_rates_from_eval_flat( + source_eval_flat: str, + filter_clause: str, + *, + metrics: str = "both", +) -> str: + """TPR/FPR by ``distance_bin`` from ``view_eval_flat`` rows, with filters pushed into the stats CTE. + + Distance charts used to ``SELECT ... FROM view_tpr_fpr_* WHERE ...`` (nested view over parquet). On some + DuckDB builds that plan can **SIGSEGV** the process (container exit **139**). This query inlines the same + stats aggregation and applies ``WHERE`` on the flat view instead. 
""" - con.execute(query) + order_by = "ORDER BY CAST(REPLACE(SPLIT_PART(distance_bin, ',', 1), '[', ' ') AS INTEGER)" + inner = f""" + WITH stats AS ( + {_TPR_FPR_STATS_SELECT} + FROM {source_eval_flat} + WHERE ({filter_clause}) + GROUP BY + {_TPR_FPR_STATS_GROUP_BY} + )""" + if metrics == "both": + return f""" + {inner} + SELECT + distance_bin, + CASE WHEN SUM(gt_total) > 0 THEN CAST(SUM(tp_gt) AS DOUBLE) / SUM(gt_total) ELSE 0 END AS tpr, + CASE WHEN SUM(est_total) > 0 THEN CAST(SUM(fp_est) AS DOUBLE) / SUM(est_total) ELSE 0 END AS fpr + FROM stats + GROUP BY distance_bin + {order_by} + """ + if metrics == "tpr": + return f""" + {inner} + SELECT distance_bin, + CASE WHEN SUM(gt_total) > 0 THEN CAST(SUM(tp_gt) AS DOUBLE) / SUM(gt_total) ELSE 0 END AS tpr + FROM stats + GROUP BY distance_bin + {order_by} + """ + if metrics == "fpr": + return f""" + {inner} + SELECT distance_bin, + CASE WHEN SUM(est_total) > 0 THEN CAST(SUM(fp_est) AS DOUBLE) / SUM(est_total) ELSE 0 END AS fpr + FROM stats + GROUP BY distance_bin + {order_by} + """ + raise ValueError(f"metrics must be 'both', 'tpr', or 'fpr', got {metrics!r}") + def build_filter_clause(filters: dict,*, enable_dist_h: bool = True) -> str: - """Build WHERE clause from filters.""" + """Build WHERE clause from filters. + + For label / suites / visibility: ``None`` means this dimension is inactive (e.g. no suite column). + An empty list ``[]`` means no restriction on that dimension (same as all options selected). + Using ``if filters.get('label')`` would treat ``[]`` as falsy and accidentally drop the filter, + causing full scans (very slow on large Parquet). 
+ """ conditions = [] if filters.get('topic_name') and filters['topic_name'] != '__all__': conditions.append(f"topic_name = '{filters['topic_name']}'") - if filters.get('label'): - if isinstance(filters['label'], list) and len(filters['label']) > 0: - # Escape single quotes in labels - labels_escaped = [str(l).replace("'", "''") for l in filters['label']] - labels_str = "', '".join(labels_escaped) - conditions.append(f"label IN ('{labels_str}')") - elif not isinstance(filters['label'], list) and filters['label'] != '__all__': - label_escaped = str(filters['label']).replace("'", "''") + lbl = filters.get('label') + if lbl is not None: + if isinstance(lbl, list): + if len(lbl) > 0: + labels_escaped = [str(l).replace("'", "''") for l in lbl] + labels_str = "', '".join(labels_escaped) + conditions.append(f"label IN ('{labels_str}')") + elif not isinstance(lbl, list) and lbl != '__all__': + label_escaped = str(lbl).replace("'", "''") conditions.append(f"label = '{label_escaped}'") - if filters.get('suites'): - if isinstance(filters['suites'], list) and len(filters['suites']) > 0: - suite_escaped = [str(s).replace("'", "''") for s in filters['suites']] - suite_str = "', '".join(suite_escaped) - conditions.append(f"COALESCE(CAST(suite_name AS VARCHAR), '') IN ('{suite_str}')") - elif not isinstance(filters['suites'], list) and filters['suites'] != '__all__': - s_escaped = str(filters['suites']).replace("'", "''") + su = filters.get('suites') + if su is not None: + if isinstance(su, list): + if len(su) > 0: + suite_escaped = [str(s).replace("'", "''") for s in su] + suite_str = "', '".join(suite_escaped) + conditions.append(f"COALESCE(CAST(suite_name AS VARCHAR), '') IN ('{suite_str}')") + elif not isinstance(su, list) and su != '__all__': + s_escaped = str(su).replace("'", "''") conditions.append(f"COALESCE(CAST(suite_name AS VARCHAR), '') = '{s_escaped}'") - if filters.get('visibility'): - if isinstance(filters['visibility'], list) and len(filters['visibility']) > 0: - # 
Escape single quotes in visibility values - vis_escaped = [str(v).replace("'", "''") for v in filters['visibility']] - vis_str = "', '".join(vis_escaped) - conditions.append(f"COALESCE(visibility, 'not available') IN ('{vis_str}')") - elif not isinstance(filters['visibility'], list): - vis_escaped = str(filters['visibility']).replace("'", "''") + vis = filters.get('visibility') + if vis is not None: + if isinstance(vis, list): + if len(vis) > 0: + vis_escaped = [str(v).replace("'", "''") for v in vis] + vis_str = "', '".join(vis_escaped) + conditions.append(f"COALESCE(visibility, 'not available') IN ('{vis_str}')") + elif not isinstance(vis, list): + vis_escaped = str(vis).replace("'", "''") conditions.append(f"COALESCE(visibility, 'not available') = '{vis_escaped}'") if enable_dist_h and filters.get('max_eval_range'): @@ -542,1438 +679,1741 @@ def build_filter_clause(filters: dict,*, enable_dist_h: bool = True) -> str: target_files.append(tf) con = get_duckdb_connection() -for i, (path, lbl) in enumerate(zip(target_files, run_labels_list)): - ok, msg = validate_parquet_file(con, path) - if not ok: - st.sidebar.error(f"**Run ({lbl}) file** cannot be read: {msg}") - st.stop() +fp = _parquet_selection_fingerprint(target_files) +cache_hit = st.session_state.get("_ds_parquet_fp") == fp and "_ds_filter_opts" in st.session_state +selected_run_paths = [Path(r["path"]) for r in runs] +cached_target_files = list(target_files) +cache_rebuild_notes: List[str] = [] + +ds_dlog( + "duckdb setup: fp=%s cache_hit=%s n_runs=%s target_files=%s", + fp, + cache_hit, + len(target_files), + [os.path.basename(p) for p in target_files], +) +ds_debug_log_memory("before_duckdb_validate_views") + +with ds_dtimer("duckdb_validate_views_list_values_or_cache", st.session_state): + if not cache_hit: + for i, (path, lbl) in enumerate(zip(target_files, run_labels_list)): + ok, msg = validate_parquet_file(con, path) + if not ok: + st.sidebar.error(f"**Run ({lbl}) file** cannot be read: {msg}") + 
st.stop() + + # Automatically materialize eval_flat cache parquet(s) under each run. + for i, (path, run_path, lbl) in enumerate(zip(target_files, selected_run_paths, run_labels_list)): + cached_path, rebuilt = _ensure_detection_stats_eval_flat_cache( + con, + run_path=run_path, + source_path=path, + ) + cached_target_files[i] = cached_path + if rebuilt: + cache_rebuild_notes.append(f"Run {lbl}: refreshed detection cache from {os.path.basename(path)}") -# Create one eval_flat + tpr_fpr view per run (view_eval_flat_1, view_tpr_fpr_1, ...) -try: - for i, path in enumerate(target_files): - v_flat = "view_eval_flat" if i == 0 else f"view_eval_flat_{i}" - v_tpr = "view_tpr_fpr_by_class_dist_topic" if i == 0 else f"view_tpr_fpr_{i}" - create_view_eval_flat(con, path, v_flat) - create_view_tpr_fpr(con, v_tpr, source_eval_flat=v_flat) -except Exception as e: - st.error(f"Error creating views: {e}") - st.stop() + # One eval_flat view per run. (TPR/FPR layered views are not created: Distance queries inline the same + # stats from eval_flat — nested view + aggregate can segfault DuckDB, exit 139.) 
+ try: + for i, path in enumerate(cached_target_files): + v_flat = "view_eval_flat" if i == 0 else f"view_eval_flat_{i}" + create_view_eval_flat(con, path, v_flat) + except Exception as e: + st.error(f"Error creating views: {e}") + st.stop() + + # Filter options from first file (applied to all runs) + target_file = cached_target_files[0] + topics = list_values(con, target_file, "topic_name") + labels = list_values(con, target_file, "label") + try: + suite_options = list_values(con, target_file, "COALESCE(CAST(suite_name AS VARCHAR), '')") + except Exception: + suite_options = [] + vis_options = list_values(con, target_file, "COALESCE(CAST(visibility AS VARCHAR), 'not available') AS visibility") + schema = schema_flags(con, target_file) + st.session_state["_ds_parquet_fp"] = fp + st.session_state["_ds_filter_opts"] = { + "topics": topics, + "labels": labels, + "suite_options": suite_options, + "vis_options": vis_options, + "schema": schema, + "cached_target_files": list(cached_target_files), + "cache_rebuild_notes": list(cache_rebuild_notes), + } + else: + opts = st.session_state["_ds_filter_opts"] + topics = opts["topics"] + labels = opts["labels"] + suite_options = opts["suite_options"] + vis_options = opts["vis_options"] + schema = opts["schema"] + target_file = target_files[0] + cached_target_files = opts.get("cached_target_files", list(target_files)) + cache_rebuild_notes = opts.get("cache_rebuild_notes", []) + for i, path in enumerate(cached_target_files): + v_flat = "view_eval_flat" if i == 0 else f"view_eval_flat_{i}" + create_view_eval_flat(con, path, v_flat) + +ds_debug_log_memory("after_duckdb_validate_views") -# Filter options from first file (applied to all runs) -target_file = target_files[0] with st.sidebar: - topics = list_values(con, target_file, "topic_name") topic_name = st.selectbox("Topic Name", ["__all__"] + topics, key="topic_name") if topics else "__all__" - labels = list_values(con, target_file, "label") - selected_labels = 
st.multiselect("Label(s)", labels, default=labels[:5] if labels and len(labels) > 5 else (labels or []), key="labels") - try: - suite_options = list_values(con, target_file, "COALESCE(CAST(suite_name AS VARCHAR), '')") - except Exception: - suite_options = [] - selected_suites = st.multiselect("Suites", suite_options, default=suite_options, key="suites", help="Filter by suite(s). Default: all included.") if suite_options else [] - vis_options = list_values(con, target_file, "COALESCE(CAST(visibility AS VARCHAR), 'not available') AS visibility") - selected_visibility = st.multiselect("Visibility", vis_options, default=vis_options, key="visibility") if vis_options else [] + # Widget keys: avoid generic "labels"/"visibility" (session_state collisions, ambiguous with run_labels). + if "ds_filter_class_labels" not in st.session_state and "labels" in st.session_state: + st.session_state["ds_filter_class_labels"] = st.session_state["labels"] + if "ds_filter_visibility" not in st.session_state and "visibility" in st.session_state: + st.session_state["ds_filter_visibility"] = st.session_state["visibility"] + if labels: + if "ds_filter_class_labels" not in st.session_state: + st.session_state["ds_filter_class_labels"] = list(labels) + selected_labels = st.multiselect( + "Label(s)", + labels, + key="ds_filter_class_labels", + ) + else: + selected_labels = [] + if suite_options: + if "suites" not in st.session_state: + st.session_state["suites"] = list(suite_options) + selected_suites = st.multiselect( + "Suites", + suite_options, + key="suites", + help="Filter by suite(s). 
Default: all included.", + ) + else: + selected_suites = [] + if vis_options: + if "ds_filter_visibility" not in st.session_state: + st.session_state["ds_filter_visibility"] = list(vis_options) + selected_visibility = st.multiselect( + "Visibility", + vis_options, + key="ds_filter_visibility", + ) + else: + selected_visibility = [] max_eval_range = st.selectbox("Max Evaluation Range [m]", [50, 80, 100, 120, 150], index=0, key="max_eval_range") -# Build filters (same values for all runs) +# Build filters (same values for all runs). None = dimension unused (no suite/visibility column in UI). filters_base = { 'topic_name': topic_name, 'label': selected_labels, - 'suites': selected_suites, - 'visibility': selected_visibility, + 'suites': selected_suites if suite_options else None, + 'visibility': selected_visibility if vis_options else None, 'max_eval_range': max_eval_range } filters_list = [filters_base] * len(runs) -# Schema flags for optional columns (confidence, velocity, etc.) -schema = schema_flags(con, target_file) +try: + _fcl_preview = build_filter_clause(filters_base) +except Exception as _e_fcl: + _fcl_preview = f"" +ds_dlog("filters_base keys=%s filter_clause_preview=%s", list(filters_base.keys()), _fcl_preview[:800]) -# Banner while the rest of the page (queries + charts) streams in — cleared at end of script. +# Banner while the rest of the page (queries + charts) streams in — cleared in finally (even on errors). 
_ds_loading_banner = st.empty() -_ds_loading_banner.markdown(detection_stats_page_loading_banner_markup(), unsafe_allow_html=True) - -# ============================= -# Main Content -# ============================= - -# ----------------------------- -# KPI strip (TP, FP, FN, TPR, FPR, Precision, Recall, F1) -# ----------------------------- -def _flat_view(i: int) -> str: - return "view_eval_flat" if i == 0 else f"view_eval_flat_{i}" - -def _kpi_row_for_view(con, view: str, filter_clause: str): - """Return dict with tp_gt, fn, tp_est, fp and derived TPR, FPR, Precision, Recall, F1.""" - q = f""" - SELECT - COUNT(*) FILTER (WHERE source = 'GT' AND status = 'TP') AS tp_gt, - COUNT(*) FILTER (WHERE source = 'GT' AND status = 'FN') AS fn, - COUNT(*) FILTER (WHERE source = 'EST' AND status = 'TP') AS tp_est, - COUNT(*) FILTER (WHERE source = 'EST' AND status = 'FP') AS fp - FROM {view} - WHERE {filter_clause} - """ - row = con.execute(q).fetchone() - if not row: - return None - tp_gt, fn, tp_est, fp = int(row[0]), int(row[1]), int(row[2]), int(row[3]) - gt_total = tp_gt + fn - est_total = tp_est + fp - tpr = (tp_gt / gt_total) if gt_total > 0 else None - fpr = (fp / est_total) if est_total > 0 else None - precision = (tp_est / est_total) if est_total > 0 else None - recall = tpr - if precision is not None and recall is not None and (precision + recall) > 0: - f1 = 2 * precision * recall / (precision + recall) - else: - f1 = None - return { - "tp": tp_gt, "fp": fp, "fn": fn, - "tpr": tpr, "fpr": fpr, "precision": precision, "recall": recall, "f1": f1, - } +_cache_note = " ".join(cache_rebuild_notes) +_ds_loading_banner.markdown(_banner_html_with_note(_cache_note), unsafe_allow_html=True) +try: + ds_dlog("main_content_try_enter") + ds_debug_log_memory("main_content_start") -# ============================= -# Panel 1: t4dataset Summary -# ============================= -st.markdown(section_header_html("Summary", "Within selected filters and max evaluation range."), 
unsafe_allow_html=True) -if single_mode: - with ds_spot_loading("Summary · KPI metrics"): - fc = build_filter_clause(filters_base) - kpi = _kpi_row_for_view(con, "view_eval_flat", fc) - inject_detection_stats_kpi_styles() - if kpi: - html = '
' + render_kpi_card("Metrics (within filters & max range)", kpi) + "
" - st.markdown(html, unsafe_allow_html=True) + # ============================= + # Main Content + # ============================= + + # ----------------------------- + # KPI strip (TP, FP, FN, TPR, FPR, Precision, Recall, F1) + # ----------------------------- + def _flat_view(i: int) -> str: + return "view_eval_flat" if i == 0 else f"view_eval_flat_{i}" + + def _kpi_row_for_view(con, view: str, filter_clause: str): + """Return dict with tp_gt, fn, tp_est, fp and derived TPR, FPR, Precision, Recall, F1.""" + q = f""" + SELECT + COUNT(*) FILTER (WHERE source = 'GT' AND status = 'TP') AS tp_gt, + COUNT(*) FILTER (WHERE source = 'GT' AND status = 'FN') AS fn, + COUNT(*) FILTER (WHERE source = 'EST' AND status = 'TP') AS tp_est, + COUNT(*) FILTER (WHERE source = 'EST' AND status = 'FP') AS fp + FROM {view} + WHERE {filter_clause} + """ + row = con.execute(q).fetchone() + if not row: + return None + tp_gt, fn, tp_est, fp = int(row[0]), int(row[1]), int(row[2]), int(row[3]) + gt_total = tp_gt + fn + est_total = tp_est + fp + tpr = (tp_gt / gt_total) if gt_total > 0 else None + fpr = (fp / est_total) if est_total > 0 else None + precision = (tp_est / est_total) if est_total > 0 else None + recall = tpr + if precision is not None and recall is not None and (precision + recall) > 0: + f1 = 2 * precision * recall / (precision + recall) + else: + f1 = None + return { + "tp": tp_gt, "fp": fp, "fn": fn, + "tpr": tpr, "fpr": fpr, "precision": precision, "recall": recall, "f1": f1, + } + + # ============================= + # Panel 1: t4dataset Summary + # ============================= + ds_dlog("section: Panel1_Summary_start") + st.markdown(section_header_html("Summary", "Within selected filters and max evaluation range."), unsafe_allow_html=True) + if single_mode: + with ds_spot_loading("Summary · KPI metrics"): + fc = build_filter_clause(filters_base) + kpi = _kpi_row_for_view(con, "view_eval_flat", fc) + inject_detection_stats_kpi_styles() + if kpi: + html = '
' + render_kpi_card("Metrics (within filters & max range)", kpi) + "
" + st.markdown(html, unsafe_allow_html=True) + else: + st.caption("No KPI data.") else: - st.caption("No KPI data.") -else: - with ds_spot_loading("Summary · KPI metrics"): - kpis = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i]) - kpi = _kpi_row_for_view(con, _flat_view(i), fc) - kpis.append((run_labels_list[i], kpi)) - inject_detection_stats_kpi_styles() - baseline = kpis[0][1] if kpis else None - cards_html_parts = [] - for lbl, kpi in kpis: - deltas = None - if baseline and kpi and lbl != run_labels_list[0]: - deltas = { - "tp": kpi["tp"] - baseline["tp"], - "fp": kpi["fp"] - baseline["fp"], - "fn": kpi["fn"] - baseline["fn"], - "tpr": (kpi["tpr"] - baseline["tpr"]) if (kpi.get("tpr") is not None and baseline.get("tpr") is not None) else None, - "fpr": (kpi["fpr"] - baseline["fpr"]) if (kpi.get("fpr") is not None and baseline.get("fpr") is not None) else None, - "precision": (kpi["precision"] - baseline["precision"]) if (kpi.get("precision") is not None and baseline.get("precision") is not None) else None, - "recall": (kpi["recall"] - baseline["recall"]) if (kpi.get("recall") is not None and baseline.get("recall") is not None) else None, - "f1": (kpi["f1"] - baseline["f1"]) if (kpi.get("f1") is not None and baseline.get("f1") is not None) else None, - } - cards_html_parts.append(render_kpi_card(f"Run {lbl}", kpi or {}, f"kpi-run-{lbl}", deltas=deltas)) - st.markdown('
' + "".join(cards_html_parts) + "
", unsafe_allow_html=True) - -if st.checkbox("Debug: Inspect Parquet (All Runs)" if not single_mode else "Debug: Inspect Parquet"): - cols_used = st.columns(len(target_files)) - file_labels = [(f"Run ({run_labels_list[i]}) File", target_files[i]) for i in range(len(target_files))] - schema_results = [] - for col, (label, file_path) in zip(cols_used, file_labels): - with col: - st.markdown(f"### {label}") - # Schema - schema_df = con.execute(""" - DESCRIBE SELECT * FROM read_parquet(?) - """, [file_path]).df() - schema_results.append((label, schema_df)) - st.write("**Schema (Column Names, Types)**") - st.markdown("Shows the schema (column names and their DuckDB/Parquet data types) of the selected Parquet file. Useful to check data structure and types as interpreted by DuckDB.") - st.dataframe(schema_df, width='stretch', hide_index=True) - - # Preview rows - row_options = [10, 20, 50, 100, 200, "All"] - preview_key = f"preview_row_limit_{label.replace(' ', '_').lower()}" - row_choice = st.selectbox(f"Preview rows to show ({label})", row_options, index=1, key=preview_key) - if row_choice == "All": - limit_clause = "" - else: - limit_clause = f"LIMIT {row_choice}" - preview_df = con.execute(f""" - SELECT * - FROM read_parquet(?) - {limit_clause} - """, [file_path]).df() - st.write(f"**Preview (First {row_choice} rows)**") - st.markdown(f"Shows the first {row_choice} preview rows from the Parquet file. Use this preview to examine example data contents and check that your file is as expected.") - st.dataframe(preview_df, width='stretch', hide_index=True) - - # Stats - stats_df = con.execute(""" - SELECT - COUNT(*) AS total_rows, - COUNT(t4dataset_id) AS non_null_ids, - COUNT(DISTINCT t4dataset_id) AS distinct_ids - FROM read_parquet(?) 
- """, [file_path]).df() - st.write("**Stats (Row Count, t4dataset_id non-null count, Distinct t4dataset_id count)**") - st.markdown(""" - - `total_rows`: Total rows in the file - - `non_null_ids`: Rows where t4dataset_id is not null - - `distinct_ids`: Unique t4dataset_id values - - This helps rapidly assess the completeness and distribution of the key ID field. - """) - st.dataframe(stats_df, width='stretch', hide_index=True) - - # --- Show info about schema differences (compare mode only) --- - if not single_mode and len(schema_results) >= 2: - with st.expander("⚖️ Difference between schemas", expanded=(len(schema_results) == 2)): - if len(schema_results) == 2: - label1, df1 = schema_results[0] - label2, df2 = schema_results[1] - names1 = set(df1["column_name"]) - names2 = set(df2["column_name"]) - added, removed = names2 - names1, names1 - names2 - common = names1 & names2 - types1 = {row["column_name"]: row["column_type"] for _, row in df1.iterrows()} - types2 = {row["column_name"]: row["column_type"] for _, row in df2.iterrows()} - dtype_changes = [(c, types1.get(c), types2.get(c)) for c in sorted(common) if types1.get(c) != types2.get(c)] - if not (added or removed or dtype_changes): - st.success("✅ The schemas are identical (column names and types match exactly).") + with ds_spot_loading("Summary · KPI metrics"): + kpis = [] + for i in range(len(runs)): + fc = build_filter_clause(filters_list[i]) + kpi = _kpi_row_for_view(con, _flat_view(i), fc) + kpis.append((run_labels_list[i], kpi)) + inject_detection_stats_kpi_styles() + baseline = kpis[0][1] if kpis else None + cards_html_parts = [] + for lbl, kpi in kpis: + deltas = None + if baseline and kpi and lbl != run_labels_list[0]: + deltas = { + "tp": kpi["tp"] - baseline["tp"], + "fp": kpi["fp"] - baseline["fp"], + "fn": kpi["fn"] - baseline["fn"], + "tpr": (kpi["tpr"] - baseline["tpr"]) if (kpi.get("tpr") is not None and baseline.get("tpr") is not None) else None, + "fpr": (kpi["fpr"] - baseline["fpr"]) if 
(kpi.get("fpr") is not None and baseline.get("fpr") is not None) else None, + "precision": (kpi["precision"] - baseline["precision"]) if (kpi.get("precision") is not None and baseline.get("precision") is not None) else None, + "recall": (kpi["recall"] - baseline["recall"]) if (kpi.get("recall") is not None and baseline.get("recall") is not None) else None, + "f1": (kpi["f1"] - baseline["f1"]) if (kpi.get("f1") is not None and baseline.get("f1") is not None) else None, + } + cards_html_parts.append(render_kpi_card(f"Run {lbl}", kpi or {}, f"kpi-run-{lbl}", deltas=deltas)) + st.markdown('
' + "".join(cards_html_parts) + "
", unsafe_allow_html=True) + + if st.checkbox("Debug: Inspect Parquet (All Runs)" if not single_mode else "Debug: Inspect Parquet"): + cols_used = st.columns(len(target_files)) + file_labels = [(f"Run ({run_labels_list[i]}) File", target_files[i]) for i in range(len(target_files))] + schema_results = [] + for col, (label, file_path) in zip(cols_used, file_labels): + with col: + st.markdown(f"### {label}") + # Schema + schema_df = con.execute(""" + DESCRIBE SELECT * FROM read_parquet(?) + """, [file_path]).df() + schema_results.append((label, schema_df)) + st.write("**Schema (Column Names, Types)**") + st.markdown("Shows the schema (column names and their DuckDB/Parquet data types) of the selected Parquet file. Useful to check data structure and types as interpreted by DuckDB.") + st.dataframe(schema_df, width='stretch', hide_index=True) + + # Preview rows + row_options = [10, 20, 50, 100, 200, "All"] + preview_key = f"preview_row_limit_{label.replace(' ', '_').lower()}" + row_choice = st.selectbox(f"Preview rows to show ({label})", row_options, index=1, key=preview_key) + if row_choice == "All": + limit_clause = "" + else: + limit_clause = f"LIMIT {row_choice}" + preview_df = con.execute(f""" + SELECT * + FROM read_parquet(?) + {limit_clause} + """, [file_path]).df() + st.write(f"**Preview (First {row_choice} rows)**") + st.markdown(f"Shows the first {row_choice} preview rows from the Parquet file. Use this preview to examine example data contents and check that your file is as expected.") + st.dataframe(preview_df, width='stretch', hide_index=True) + + # Stats + stats_df = con.execute(""" + SELECT + COUNT(*) AS total_rows, + COUNT(t4dataset_id) AS non_null_ids, + COUNT(DISTINCT t4dataset_id) AS distinct_ids + FROM read_parquet(?) 
+ """, [file_path]).df() + st.write("**Stats (Row Count, t4dataset_id non-null count, Distinct t4dataset_id count)**") + st.markdown(""" + - `total_rows`: Total rows in the file + - `non_null_ids`: Rows where t4dataset_id is not null + - `distinct_ids`: Unique t4dataset_id values + + This helps rapidly assess the completeness and distribution of the key ID field. + """) + st.dataframe(stats_df, width='stretch', hide_index=True) + + # --- Show info about schema differences (compare mode only) --- + if not single_mode and len(schema_results) >= 2: + with st.expander("⚖️ Difference between schemas", expanded=(len(schema_results) == 2)): + if len(schema_results) == 2: + label1, df1 = schema_results[0] + label2, df2 = schema_results[1] + names1 = set(df1["column_name"]) + names2 = set(df2["column_name"]) + added, removed = names2 - names1, names1 - names2 + common = names1 & names2 + types1 = {row["column_name"]: row["column_type"] for _, row in df1.iterrows()} + types2 = {row["column_name"]: row["column_type"] for _, row in df2.iterrows()} + dtype_changes = [(c, types1.get(c), types2.get(c)) for c in sorted(common) if types1.get(c) != types2.get(c)] + if not (added or removed or dtype_changes): + st.success("✅ The schemas are identical (column names and types match exactly).") + else: + if added: + st.error(f"Columns only in `{label2}`: {', '.join(sorted(added))}") + if removed: + st.error(f"Columns only in `{label1}`: {', '.join(sorted(removed))}") + if dtype_changes: + st.warning("Columns with different types:") + st.dataframe(pd.DataFrame(dtype_changes, columns=["Column", f"Type in {label1}", f"Type in {label2}"]), width='stretch', hide_index=True) else: - if added: - st.error(f"Columns only in `{label2}`: {', '.join(sorted(added))}") - if removed: - st.error(f"Columns only in `{label1}`: {', '.join(sorted(removed))}") - if dtype_changes: - st.warning("Columns with different types:") - st.dataframe(pd.DataFrame(dtype_changes, columns=["Column", f"Type in {label1}", 
f"Type in {label2}"]), width='stretch', hide_index=True) + st.info(f"{len(schema_results)} runs loaded. Compare schemas per run in the columns above.") + + + + ds_dlog("section: Dataset_summary_status_distribution_try") + try: + with ds_spot_loading("Dataset summary & status distribution"): + if single_mode: + query_base = f""" + SELECT COUNT(DISTINCT t4dataset_id) AS id_num, '{os.path.basename(target_file)}' AS series + FROM view_eval_flat + """ + df_summary = con.execute(query_base).df() + query_status = """ + SELECT label, status, COUNT(*) AS num + FROM view_eval_flat + GROUP BY label, status + ORDER BY label, status + """ + df_status = con.execute(query_status).df() else: - st.info(f"{len(schema_results)} runs loaded. Compare schemas per run in the columns above.") - - - -try: - with ds_spot_loading("Dataset summary & status distribution"): + parts = [f"SELECT COUNT(DISTINCT t4dataset_id) AS id_num, '{run_labels_list[i]}' AS series FROM {_flat_view(i)}" for i in range(len(runs))] + query_base = " UNION ALL ".join(parts) + df_summary = con.execute(query_base).df() + parts_status = [f"SELECT '{run_labels_list[i]}' AS dataset, label, status, COUNT(*) AS num FROM {_flat_view(i)} GROUP BY label, status" for i in range(len(runs))] + query_status = " UNION ALL ".join(parts_status) + " ORDER BY dataset, label, status" + df_status = con.execute(query_status).df() + if single_mode: - query_base = f""" - SELECT COUNT(DISTINCT t4dataset_id) AS id_num, '{os.path.basename(target_file)}' AS series - FROM view_eval_flat - """ - df_summary = con.execute(query_base).df() - query_status = """ - SELECT label, status, COUNT(*) AS num - FROM view_eval_flat - GROUP BY label, status - ORDER BY label, status - """ - df_status = con.execute(query_status).df() - else: - parts = [f"SELECT COUNT(DISTINCT t4dataset_id) AS id_num, '{run_labels_list[i]}' AS series FROM {_flat_view(i)}" for i in range(len(runs))] - query_base = " UNION ALL ".join(parts) - df_summary = 
con.execute(query_base).df() - parts_status = [f"SELECT '{run_labels_list[i]}' AS dataset, label, status, COUNT(*) AS num FROM {_flat_view(i)} GROUP BY label, status" for i in range(len(runs))] - query_status = " UNION ALL ".join(parts_status) + " ORDER BY dataset, label, status" - df_status = con.execute(query_status).df() - - if single_mode: - if not df_status.empty: - if st.checkbox("Debug: Inspect Status Count (All Runs)" if not single_mode else "Debug: Inspect Status Count"): - df_status_wide = df_status.pivot_table(index='label', columns='status', values='num', fill_value=0).reset_index() - st.download_button("Download status count (CSV)", data=df_status_wide.to_csv(index=False).encode("utf-8"), file_name="detection_status_count.csv", mime="text/csv", key="dl_status_count") - st.dataframe(df_status_wide, width='stretch', hide_index=True) - status_viz = st.radio( - "Status chart style", - options=["Stacked bar (counts)", "Treemap", "100% stacked (proportions)", "Spider chart (TP, FP & FN)"], - index=0, - horizontal=True, - key="status_dist_viz", - ) - n_labels = df_status["label"].nunique() - use_horizontal = n_labels > 6 - if status_viz == "Stacked bar (counts)": - if use_horizontal: - fig2 = px.bar( - df_status, - y="label", - x="num", - color="status", - barmode="stack", - title="Status Distribution per Label", - labels={"num": "Count", "label": "Label", "status": "Status"}, - color_discrete_map=STATUS_COLORS, - orientation="h", - ) - else: - fig2 = px.bar( - df_status, - x="label", - y="num", - color="status", - barmode="stack", - title="Status Distribution per Label", - labels={"num": "Count", "label": "Label", "status": "Status"}, - color_discrete_map=STATUS_COLORS, - ) - apply_chart_theme(fig2) - st.plotly_chart(fig2, width='stretch') - elif status_viz == "Treemap": - fig2 = px.treemap( - df_status, - path=["label", "status"], - values="num", - color="status", - color_discrete_map=STATUS_COLORS, - title="Status Distribution per Label (area = count)", - 
) - fig2.update_traces( - textinfo="label+value+percent parent", - hovertemplate="%{label}
Count: %{value}", + if not df_status.empty: + if st.checkbox("Debug: Inspect Status Count (All Runs)" if not single_mode else "Debug: Inspect Status Count"): + df_status_wide = df_status.pivot_table(index='label', columns='status', values='num', fill_value=0).reset_index() + st.download_button("Download status count (CSV)", data=df_status_wide.to_csv(index=False).encode("utf-8"), file_name="detection_status_count.csv", mime="text/csv", key="dl_status_count") + st.dataframe(df_status_wide, width='stretch', hide_index=True) + status_viz = st.radio( + "Status chart style", + options=["Stacked bar (counts)", "Treemap", "100% stacked (proportions)", "Spider chart (TP, FP & FN)"], + index=0, + horizontal=True, + key="status_dist_viz", ) - apply_chart_theme(fig2, height=420) - st.plotly_chart(fig2, width='stretch') - elif status_viz == "Spider chart (TP, FP & FN)": - wide = df_status.pivot_table(index="label", columns="status", values="num", fill_value=0) - cats = sorted(wide.index.astype(str).unique()) - if len(cats) > 16: - st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") - run_single = [os.path.basename(target_file) if target_file else "Run"] - rcols = st.columns(3) - for col_i, st_name in enumerate(["TP", "FP", "FN"]): - vals = wide[st_name] if st_name in wide.columns else pd.Series(0, index=wide.index) - df_m = pd.DataFrame({"label": wide.index.astype(str), "count": vals.values}) - df_m["run"] = run_single[0] - fig_r = _count_spider_compare( - df_m, - cats, - f"{st_name} count per label", - run_single, - f"{st_name} count", - ) - with rcols[col_i]: - st.plotly_chart(fig_r, width='stretch') - else: - # 100% stacked: proportion per label - wide = df_status.pivot_table(index="label", columns="status", values="num", fill_value=0) - wide_pct = wide.div(wide.sum(axis=1), axis=0) - df_pct = wide_pct.reset_index().melt(id_vars="label", var_name="status", value_name="pct") - df_pct = df_pct[df_pct["pct"] > 0] - if not df_pct.empty: + 
n_labels = df_status["label"].nunique() + use_horizontal = n_labels > 6 + if status_viz == "Stacked bar (counts)": if use_horizontal: fig2 = px.bar( - df_pct, + df_status, y="label", - x="pct", + x="num", color="status", barmode="stack", - title="Status proportion per Label (100% stacked)", - labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + title="Status Distribution per Label", + labels={"num": "Count", "label": "Label", "status": "Status"}, color_discrete_map=STATUS_COLORS, orientation="h", ) else: fig2 = px.bar( - df_pct, + df_status, x="label", - y="pct", + y="num", color="status", barmode="stack", - title="Status proportion per Label (100% stacked)", - labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + title="Status Distribution per Label", + labels={"num": "Count", "label": "Label", "status": "Status"}, color_discrete_map=STATUS_COLORS, ) apply_chart_theme(fig2) - if use_horizontal: - fig2.update_layout(xaxis_tickformat=".0%", xaxis_range=[0, 1]) - else: - fig2.update_layout(yaxis_tickformat=".0%", yaxis_range=[0, 1]) st.plotly_chart(fig2, width='stretch') - else: - st.info("No data for proportions.") - else: - st.info("No status count data available") - else: - if not df_status.empty: - if st.checkbox("Debug: Inspect Status Count (All Runs)" if not single_mode else "Debug: Inspect Status Count"): - df_status_wide = df_status.pivot_table(index='label', columns=['dataset', 'status'], values='num', fill_value=0) - df_status_wide.columns = [f"{col[0]} {col[1]}" for col in df_status_wide.columns] - df_status_wide = df_status_wide.reset_index() - st.dataframe(df_status_wide, width='stretch', hide_index=True) - status_viz = st.radio( - "Status chart style", - options=["Stacked bar (counts)", "Treemap", "100% stacked (proportions)", "Spider chart (TP, FP & FN)"], - index=0, - horizontal=True, - key="status_dist_viz_compare", - ) - if status_viz == "Stacked bar (counts)": - fig2 = px.bar( - df_status, - x="label", - y="num", - 
color="status", - barmode="stack", - facet_col="dataset", - title="Status Distribution per Label (by Run)", - category_orders={"dataset": run_labels_list}, - labels={"num": "Count", "label": "Label", "status": "Status"}, - color_discrete_map=STATUS_COLORS, - ) - apply_chart_theme(fig2) - st.plotly_chart(fig2, width='stretch') - elif status_viz == "Spider chart (TP, FP & FN)": - # Same counts as stacked bar: one spider per status (TP / FP / FN), axes = labels, r = count - status_wide = df_status.pivot_table( - index=["dataset", "label"], columns="status", values="num", fill_value=0 - ).reset_index() - cats = sorted(df_status["label"].astype(str).unique()) - if len(cats) > 16: - st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") - rcols = st.columns(3) - for col_i, st_name in enumerate(["TP", "FP", "FN"]): - col_data = ( - status_wide[st_name] - if st_name in status_wide.columns - else pd.Series(0, index=status_wide.index) - ) - df_m = pd.DataFrame( - { - "run": status_wide["dataset"].astype(str), - "label": status_wide["label"].astype(str), - "count": col_data.values, - } + elif status_viz == "Treemap": + fig2 = px.treemap( + df_status, + path=["label", "status"], + values="num", + color="status", + color_discrete_map=STATUS_COLORS, + title="Status Distribution per Label (area = count)", ) - fig_r = _count_spider_compare( - df_m, - cats, - f"{st_name} count per label (by run)", - run_labels_list, - f"{st_name} count", + fig2.update_traces( + textinfo="label+value+percent parent", + hovertemplate="%{label}
Count: %{value}", ) - with rcols[col_i]: - st.plotly_chart(fig_r, width='stretch') - elif status_viz == "Treemap": - n_runs = len(run_labels_list) - cols = st.columns(min(n_runs, 3)) - for idx, lbl in enumerate(run_labels_list): - df_r = df_status[df_status["dataset"] == lbl] - if not df_r.empty: - fig_t = px.treemap( - df_r, - path=["label", "status"], - values="num", - color="status", - color_discrete_map=STATUS_COLORS, - title=f"{lbl}", - ) - fig_t.update_traces( - textinfo="label+value+percent parent", - hovertemplate="%{label}
Count: %{value}", + apply_chart_theme(fig2, height=420) + st.plotly_chart(fig2, width='stretch') + elif status_viz == "Spider chart (TP, FP & FN)": + wide = df_status.pivot_table(index="label", columns="status", values="num", fill_value=0) + cats = sorted(wide.index.astype(str).unique()) + if len(cats) > 16: + st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") + run_single = [os.path.basename(target_file) if target_file else "Run"] + rcols = st.columns(3) + for col_i, st_name in enumerate(["TP", "FP", "FN"]): + vals = wide[st_name] if st_name in wide.columns else pd.Series(0, index=wide.index) + df_m = pd.DataFrame({"label": wide.index.astype(str), "count": vals.values}) + df_m["run"] = run_single[0] + fig_r = _count_spider_compare( + df_m, + cats, + f"{st_name} count per label", + run_single, + f"{st_name} count", ) - apply_chart_theme(fig_t, height=360) - with cols[idx % len(cols)]: - st.plotly_chart(fig_t, width='stretch') - else: - # 100% stacked per run (facet) - df_pct_list = [] - for lbl in run_labels_list: - df_r = df_status[df_status["dataset"] == lbl] - wide = df_r.pivot_table(index="label", columns="status", values="num", fill_value=0) - if wide.empty: - continue + with rcols[col_i]: + st.plotly_chart(fig_r, width='stretch') + else: + # 100% stacked: proportion per label + wide = df_status.pivot_table(index="label", columns="status", values="num", fill_value=0) wide_pct = wide.div(wide.sum(axis=1), axis=0) - wide_pct["dataset"] = lbl - wide_pct = wide_pct.reset_index() - df_pct_list.append(wide_pct) - if df_pct_list: - wide_all = pd.concat(df_pct_list, ignore_index=True) - df_pct_melt = wide_all.melt( - id_vars=["label", "dataset"], - value_vars=[c for c in wide_all.columns if c not in ("label", "dataset")], - var_name="status", - value_name="pct", - ) - df_pct_melt = df_pct_melt[df_pct_melt["pct"] > 0] - if not df_pct_melt.empty: - fig2 = px.bar( - df_pct_melt, - x="label", - y="pct", - color="status", - 
barmode="stack", - facet_col="dataset", - category_orders={"dataset": run_labels_list}, - title="Status proportion per Label (100% stacked, by Run)", - labels={"pct": "Proportion", "label": "Label", "status": "Status"}, - color_discrete_map=STATUS_COLORS, - ) + df_pct = wide_pct.reset_index().melt(id_vars="label", var_name="status", value_name="pct") + df_pct = df_pct[df_pct["pct"] > 0] + if not df_pct.empty: + if use_horizontal: + fig2 = px.bar( + df_pct, + y="label", + x="pct", + color="status", + barmode="stack", + title="Status proportion per Label (100% stacked)", + labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + color_discrete_map=STATUS_COLORS, + orientation="h", + ) + else: + fig2 = px.bar( + df_pct, + x="label", + y="pct", + color="status", + barmode="stack", + title="Status proportion per Label (100% stacked)", + labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + color_discrete_map=STATUS_COLORS, + ) apply_chart_theme(fig2) - fig2.update_layout( - yaxis_tickformat=".0%", - yaxis_range=[0, 1], - ) - for ann in fig2.layout.annotations: - ann.text = ann.text.split("=")[-1] + if use_horizontal: + fig2.update_layout(xaxis_tickformat=".0%", xaxis_range=[0, 1]) + else: + fig2.update_layout(yaxis_tickformat=".0%", yaxis_range=[0, 1]) st.plotly_chart(fig2, width='stretch') else: st.info("No data for proportions.") - else: - st.info("No data for proportions.") - else: - st.info("No status count data available") - -except Exception as e: - st.error(f"Error in summary: {e}") - - - -def _tpr_fpr_view(i: int) -> str: - return "view_tpr_fpr_by_class_dist_topic" if i == 0 else f"view_tpr_fpr_{i}" - - -def _distance_bin_order_and_label(bin_str: str) -> Tuple[int, str]: - """Parse distance_bin e.g. '[0,10)' -> (0, '0–10 m'). 
Used for sorting and axis labels.""" - import re - s = str(bin_str).strip() - m = re.match(r"\[(\d+)\s*,\s*(\d+)\)", s) - if m: - lo, hi = int(m.group(1)), int(m.group(2)) - return (lo, f"{lo}–{hi} m") - m = re.match(r"\[(\d+)\s*,\s*inf\)", s, re.I) - if m: - return (int(m.group(1)), f"{m.group(1)}+ m") - return (0, s) - - -# Same 10 m bins as view_tpr_fpr / eval_flat (used for object-count alignment) -_DIST_BIN_CASE = """CASE - WHEN dist_h >= 0 AND dist_h < 10 THEN '[0,10)' - WHEN dist_h >= 10 AND dist_h < 20 THEN '[10,20)' - WHEN dist_h >= 20 AND dist_h < 30 THEN '[20,30)' - WHEN dist_h >= 30 AND dist_h < 40 THEN '[30,40)' - WHEN dist_h >= 40 AND dist_h < 50 THEN '[40,50)' - WHEN dist_h >= 50 AND dist_h < 60 THEN '[50,60)' - WHEN dist_h >= 60 AND dist_h < 70 THEN '[60,70)' - WHEN dist_h >= 70 AND dist_h < 80 THEN '[70,80)' - WHEN dist_h >= 80 AND dist_h < 90 THEN '[80,90)' - WHEN dist_h >= 90 AND dist_h < 100 THEN '[90,100)' - WHEN dist_h >= 100 AND dist_h < 110 THEN '[100,110)' - WHEN dist_h >= 110 AND dist_h < 120 THEN '[110,120)' - WHEN dist_h >= 120 AND dist_h < 130 THEN '[120,130)' - WHEN dist_h >= 130 AND dist_h < 140 THEN '[130,140)' - WHEN dist_h >= 140 AND dist_h < 150 THEN '[140,150)' - WHEN dist_h >= 150 THEN '[150,inf)' - ELSE '[unknown]' END""" - - -# ============================= -# Panel 3–5: Distance — TP/FP rates by bin + object count vs range -# ============================= -st.divider() -st.markdown( - section_header_html( - "Distance: TP/FP rates & object count", - "Same distance bins and chart style (line or bar) for rates and object counts; x-axis order matches across charts.", - ), - unsafe_allow_html=True, -) -rate_by_dist_style = st.radio( - "Chart style", - options=["Line chart (trend)", "Bar chart (histogram)"], - index=0, - horizontal=True, - key="tp_fp_rate_by_dist_style", -) - -filter_clause_base = build_filter_clause(filters_base, enable_dist_h=False) -_dist_slot = st.empty() -_dist_slot.markdown(ds_spot_loading_markup("Distance · 
TP/FP rates & object counts"), unsafe_allow_html=True) -try: - use_line_chart = rate_by_dist_style == "Line chart (trend)" - rate_bin_labels_order: Optional[List[str]] = None - - if single_mode: - # Fetch both TP and FP rate by distance - query_both = f""" - SELECT - distance_bin, - CASE WHEN SUM(gt_total) > 0 THEN CAST(SUM(tp_gt) AS DOUBLE) / SUM(gt_total) ELSE 0 END AS tpr, - CASE WHEN SUM(est_total) > 0 THEN CAST(SUM(fp_est) AS DOUBLE) / SUM(est_total) ELSE 0 END AS fpr - FROM view_tpr_fpr_by_class_dist_topic - WHERE {filter_clause_base} - GROUP BY distance_bin - ORDER BY CAST(REPLACE(SPLIT_PART(distance_bin, ',', 1), '[', ' ') AS INTEGER) - """ - df_both = con.execute(query_both).df() - if not df_both.empty: - df_both["bin_order"], df_both["bin_label"] = zip( - *df_both["distance_bin"].map(_distance_bin_order_and_label) - ) - df_both = df_both.sort_values("bin_order") - x_labels = df_both["bin_label"].tolist() - rate_bin_labels_order = x_labels - - if use_line_chart: - fig = go.Figure() - fig.add_trace( - go.Scatter( - x=x_labels, - y=df_both["tpr"], - name="TP rate", - mode="lines", - line=dict(color=RUN_COLORS[0], width=2.5, shape="spline"), - fill="tozeroy", - fillcolor="rgba(74, 144, 217, 0.2)", - hovertemplate="%{x}
TP rate: %{y:.2%}", - ) - ) - fig.add_trace( - go.Scatter( - x=x_labels, - y=df_both["fpr"], - name="FP rate", - mode="lines", - line=dict(color=RUN_COLORS[1], width=2.5, shape="spline"), - fill="tozeroy", - fillcolor="rgba(232, 106, 51, 0.2)", - hovertemplate="%{x}
FP rate: %{y:.2%}", - ) - ) - apply_chart_theme(fig, height=420) - fig.update_layout( - title=f"TP & FP rate by distance (within {max_eval_range} m)", - xaxis_title="Distance bin", - yaxis_title="Rate", - yaxis_range=[0, 1], - xaxis=dict( - tickangle=-35, - categoryorder="array", - categoryarray=x_labels, - ), - hovermode="x unified", - ) - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig, width='stretch') else: - # Bar chart (histogram): combined TP + FP grouped bars - fig = go.Figure() - fig.add_trace( - go.Bar( - x=x_labels, - y=df_both["tpr"], - name="TP rate", - marker_color=RUN_COLORS[0], - hovertemplate="%{x}
TP rate: %{y:.2%}", - ) + st.info("No status count data available") + else: + if not df_status.empty: + if st.checkbox("Debug: Inspect Status Count (All Runs)" if not single_mode else "Debug: Inspect Status Count"): + df_status_wide = df_status.pivot_table(index='label', columns=['dataset', 'status'], values='num', fill_value=0) + df_status_wide.columns = [f"{col[0]} {col[1]}" for col in df_status_wide.columns] + df_status_wide = df_status_wide.reset_index() + st.dataframe(df_status_wide, width='stretch', hide_index=True) + status_viz = st.radio( + "Status chart style", + options=["Stacked bar (counts)", "Treemap", "100% stacked (proportions)", "Spider chart (TP, FP & FN)"], + index=0, + horizontal=True, + key="status_dist_viz_compare", ) - fig.add_trace( - go.Bar( - x=x_labels, - y=df_both["fpr"], - name="FP rate", - marker_color=RUN_COLORS[1], - hovertemplate="%{x}
FP rate: %{y:.2%}", + if status_viz == "Stacked bar (counts)": + fig2 = px.bar( + df_status, + x="label", + y="num", + color="status", + barmode="stack", + facet_col="dataset", + title="Status Distribution per Label (by Run)", + category_orders={"dataset": run_labels_list}, + labels={"num": "Count", "label": "Label", "status": "Status"}, + color_discrete_map=STATUS_COLORS, ) - ) - apply_chart_theme(fig, height=420) - fig.update_layout( - title=f"TP & FP rate by distance (within {max_eval_range} m)", - xaxis_title="Distance bin", - yaxis_title="Rate", - yaxis_range=[0, 1], - barmode="group", - xaxis=dict( - tickangle=-35, - categoryorder="array", - categoryarray=x_labels, - ), - hovermode="x unified", - ) - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig, width='stretch') - else: - st.info("No distance-bin data available.") - else: - # Compare mode: fetch TP and FP by distance per run - dfs_tpr = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i], enable_dist_h=False) - q = f""" - SELECT distance_bin, - CASE WHEN SUM(gt_total) > 0 THEN CAST(SUM(tp_gt) AS DOUBLE) / SUM(gt_total) ELSE 0 END AS tpr - FROM {_tpr_fpr_view(i)} - WHERE {fc} - GROUP BY distance_bin - ORDER BY CAST(REPLACE(SPLIT_PART(distance_bin, ',', 1), '[', ' ') AS INTEGER) - """ - df_i = con.execute(q).df() - df_i["run"] = run_labels_list[i] - df_i["bin_order"], df_i["bin_label"] = zip(*df_i["distance_bin"].map(_distance_bin_order_and_label)) - df_i = df_i.sort_values("bin_order") - dfs_tpr.append(df_i) - df_tpr_dist = pd.concat(dfs_tpr, ignore_index=True) - - dfs_fpr = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i], enable_dist_h=False) - q = f""" - SELECT distance_bin, - CASE WHEN SUM(est_total) > 0 THEN CAST(SUM(fp_est) AS DOUBLE) / SUM(est_total) ELSE 0 END AS fpr - FROM {_tpr_fpr_view(i)} - WHERE {fc} - GROUP BY distance_bin - ORDER BY CAST(REPLACE(SPLIT_PART(distance_bin, ',', 1), '[', ' ') AS INTEGER) 
- """ - df_i = con.execute(q).df() - df_i["run"] = run_labels_list[i] - df_i["bin_order"], df_i["bin_label"] = zip(*df_i["distance_bin"].map(_distance_bin_order_and_label)) - df_i = df_i.sort_values("bin_order") - dfs_fpr.append(df_i) - df_fpr_dist = pd.concat(dfs_fpr, ignore_index=True) - - if not df_tpr_dist.empty: - rate_bin_labels_order = ( - df_tpr_dist[df_tpr_dist["run"] == run_labels_list[0]] - .sort_values("bin_order")["bin_label"] - .tolist() - ) - _xaxis_dist_bins = ( - dict(tickangle=-35, categoryorder="array", categoryarray=rate_bin_labels_order) - if rate_bin_labels_order - else dict(tickangle=-35) - ) - - if use_line_chart: - if not df_tpr_dist.empty: - fig_tpr = go.Figure() - for i, lbl in enumerate(run_labels_list): - d = df_tpr_dist[df_tpr_dist["run"] == lbl].sort_values("bin_order") - c = RUN_COLORS[i % len(RUN_COLORS)] - r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) - fig_tpr.add_trace( - go.Scatter( - x=d["bin_label"], - y=d["tpr"], - name=lbl, - mode="lines", - line=dict(color=c, width=2.2, shape="spline"), - fill="tozeroy", - fillcolor=f"rgba({r},{g},{b},0.15)", - hovertemplate=f"{lbl}
%{{x}}
TP rate: %{{y:.2%}}", + apply_chart_theme(fig2) + st.plotly_chart(fig2, width='stretch') + elif status_viz == "Spider chart (TP, FP & FN)": + # Same counts as stacked bar: one spider per status (TP / FP / FN), axes = labels, r = count + status_wide = df_status.pivot_table( + index=["dataset", "label"], columns="status", values="num", fill_value=0 + ).reset_index() + cats = sorted(df_status["label"].astype(str).unique()) + if len(cats) > 16: + st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") + rcols = st.columns(3) + for col_i, st_name in enumerate(["TP", "FP", "FN"]): + col_data = ( + status_wide[st_name] + if st_name in status_wide.columns + else pd.Series(0, index=status_wide.index) ) - ) - apply_chart_theme(fig_tpr, height=420) - fig_tpr.update_layout( - title=f"TP rate by distance", - xaxis_title="Distance bin", - yaxis_title="TP rate", - yaxis_range=[0, 1], - xaxis=_xaxis_dist_bins, - hovermode="x unified", - ) - fig_tpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig_tpr, width='stretch') - else: - st.info("No TP rate by distance data.") - - if not df_fpr_dist.empty: - fig_fpr = go.Figure() - for i, lbl in enumerate(run_labels_list): - d = df_fpr_dist[df_fpr_dist["run"] == lbl].sort_values("bin_order") - c = RUN_COLORS[i % len(RUN_COLORS)] - r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) - fig_fpr.add_trace( - go.Scatter( - x=d["bin_label"], - y=d["fpr"], - name=lbl, - mode="lines", - line=dict(color=c, width=2.2, shape="spline"), - fill="tozeroy", - fillcolor=f"rgba({r},{g},{b},0.15)", - hovertemplate=f"{lbl}
%{{x}}
FP rate: %{{y:.2%}}", + df_m = pd.DataFrame( + { + "run": status_wide["dataset"].astype(str), + "label": status_wide["label"].astype(str), + "count": col_data.values, + } ) - ) - apply_chart_theme(fig_fpr, height=420) - fig_fpr.update_layout( - title=f"FP rate by distance", - xaxis_title="Distance bin", - yaxis_title="FP rate", - yaxis_range=[0, 1], - xaxis=_xaxis_dist_bins, - hovermode="x unified", - ) - fig_fpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig_fpr, width='stretch') - else: - st.info("No FP rate by distance data.") - else: - # Bar chart (histogram) for compare: TP then FP, grouped by run - if not df_tpr_dist.empty: - fig_tpr = go.Figure() - for i, lbl in enumerate(run_labels_list): - d = df_tpr_dist[df_tpr_dist["run"] == lbl].sort_values("bin_order") - fig_tpr.add_trace( + fig_r = _count_spider_compare( + df_m, + cats, + f"{st_name} count per label (by run)", + run_labels_list, + f"{st_name} count", + ) + with rcols[col_i]: + st.plotly_chart(fig_r, width='stretch') + elif status_viz == "Treemap": + n_runs = len(run_labels_list) + cols = st.columns(min(n_runs, 3)) + for idx, lbl in enumerate(run_labels_list): + df_r = df_status[df_status["dataset"] == lbl] + if not df_r.empty: + fig_t = px.treemap( + df_r, + path=["label", "status"], + values="num", + color="status", + color_discrete_map=STATUS_COLORS, + title=f"{lbl}", + ) + fig_t.update_traces( + textinfo="label+value+percent parent", + hovertemplate="%{label}
Count: %{value}", + ) + apply_chart_theme(fig_t, height=360) + with cols[idx % len(cols)]: + st.plotly_chart(fig_t, width='stretch') + else: + # 100% stacked per run (facet) + df_pct_list = [] + for lbl in run_labels_list: + df_r = df_status[df_status["dataset"] == lbl] + wide = df_r.pivot_table(index="label", columns="status", values="num", fill_value=0) + if wide.empty: + continue + wide_pct = wide.div(wide.sum(axis=1), axis=0) + wide_pct["dataset"] = lbl + wide_pct = wide_pct.reset_index() + df_pct_list.append(wide_pct) + if df_pct_list: + wide_all = pd.concat(df_pct_list, ignore_index=True) + df_pct_melt = wide_all.melt( + id_vars=["label", "dataset"], + value_vars=[c for c in wide_all.columns if c not in ("label", "dataset")], + var_name="status", + value_name="pct", + ) + df_pct_melt = df_pct_melt[df_pct_melt["pct"] > 0] + if not df_pct_melt.empty: + fig2 = px.bar( + df_pct_melt, + x="label", + y="pct", + color="status", + barmode="stack", + facet_col="dataset", + category_orders={"dataset": run_labels_list}, + title="Status proportion per Label (100% stacked, by Run)", + labels={"pct": "Proportion", "label": "Label", "status": "Status"}, + color_discrete_map=STATUS_COLORS, + ) + apply_chart_theme(fig2) + fig2.update_layout( + yaxis_tickformat=".0%", + yaxis_range=[0, 1], + ) + for ann in fig2.layout.annotations: + ann.text = ann.text.split("=")[-1] + st.plotly_chart(fig2, width='stretch') + else: + st.info("No data for proportions.") + else: + st.info("No data for proportions.") + else: + st.info("No status count data available") + + except Exception as e: + st.error(f"Error in summary: {e}") + + + + def _distance_bin_order_and_label(bin_str: str) -> Tuple[int, str]: + """Parse distance_bin e.g. '[0,10)' -> (0, '0–10 m'). 
Used for sorting and axis labels.""" + import re + s = str(bin_str).strip() + m = re.match(r"\[(\d+)\s*,\s*(\d+)\)", s) + if m: + lo, hi = int(m.group(1)), int(m.group(2)) + return (lo, f"{lo}–{hi} m") + m = re.match(r"\[(\d+)\s*,\s*inf\)", s, re.I) + if m: + return (int(m.group(1)), f"{m.group(1)}+ m") + return (0, s) + + + # Same 10 m bins as eval_flat / TPR-FPR stats (used for object-count alignment) + _DIST_BIN_CASE = """CASE + WHEN dist_h >= 0 AND dist_h < 10 THEN '[0,10)' + WHEN dist_h >= 10 AND dist_h < 20 THEN '[10,20)' + WHEN dist_h >= 20 AND dist_h < 30 THEN '[20,30)' + WHEN dist_h >= 30 AND dist_h < 40 THEN '[30,40)' + WHEN dist_h >= 40 AND dist_h < 50 THEN '[40,50)' + WHEN dist_h >= 50 AND dist_h < 60 THEN '[50,60)' + WHEN dist_h >= 60 AND dist_h < 70 THEN '[60,70)' + WHEN dist_h >= 70 AND dist_h < 80 THEN '[70,80)' + WHEN dist_h >= 80 AND dist_h < 90 THEN '[80,90)' + WHEN dist_h >= 90 AND dist_h < 100 THEN '[90,100)' + WHEN dist_h >= 100 AND dist_h < 110 THEN '[100,110)' + WHEN dist_h >= 110 AND dist_h < 120 THEN '[110,120)' + WHEN dist_h >= 120 AND dist_h < 130 THEN '[120,130)' + WHEN dist_h >= 130 AND dist_h < 140 THEN '[130,140)' + WHEN dist_h >= 140 AND dist_h < 150 THEN '[140,150)' + WHEN dist_h >= 150 THEN '[150,inf)' + ELSE '[unknown]' END""" + + + # ============================= + # Panel 3–5: Distance — TP/FP rates by bin + object count vs range + # ============================= + ds_dlog("section: Panel3_5_Distance_start") + st.divider() + st.markdown( + section_header_html( + "Distance: TP/FP rates & object count", + "Same distance bins and chart style (line or bar) for rates and object counts; x-axis order matches across charts.", + ), + unsafe_allow_html=True, + ) + rate_by_dist_style = st.radio( + "Chart style", + options=["Line chart (trend)", "Bar chart (histogram)"], + index=0, + horizontal=True, + key="tp_fp_rate_by_dist_style", + ) + + filter_clause_base = build_filter_clause(filters_base, enable_dist_h=False) + ds_dlog( + "distance: 
filter_clause_base (no dist_h) len=%s preview=%s", + len(filter_clause_base), + filter_clause_base[:600], + ) + _dist_slot = st.empty() + _dist_slot.markdown(ds_spot_loading_markup("Distance · TP/FP rates & object counts"), unsafe_allow_html=True) + try: + ds_dlog("distance_inner_try: single_mode=%s", single_mode) + ds_debug_log_memory("distance_inner_try_start") + use_line_chart = rate_by_dist_style == "Line chart (trend)" + rate_bin_labels_order: Optional[List[str]] = None + + if single_mode: + # Inline stats from view_eval_flat (avoid nested TPR/FPR view — DuckDB can SIGSEGV on that plan). + query_both = sql_distance_bin_rates_from_eval_flat( + "view_eval_flat", filter_clause_base, metrics="both" + ) + ds_dlog("distance: executing query_both (single_mode TPR/FPR by bin, inlined from eval_flat)") + df_both = con.execute(query_both).df() + ds_dlog("distance: query_both done rows=%s cols=%s", len(df_both), list(df_both.columns)) + ds_debug_log_memory("distance_after_query_both") + if not df_both.empty: + df_both["bin_order"], df_both["bin_label"] = zip( + *df_both["distance_bin"].map(_distance_bin_order_and_label) + ) + df_both = df_both.sort_values("bin_order") + x_labels = df_both["bin_label"].tolist() + rate_bin_labels_order = x_labels + + if use_line_chart: + fig = go.Figure() + fig.add_trace( + go.Scatter( + x=x_labels, + y=df_both["tpr"], + name="TP rate", + mode="lines", + line=dict(color=RUN_COLORS[0], width=2.5, shape="spline"), + fill="tozeroy", + fillcolor="rgba(74, 144, 217, 0.2)", + hovertemplate="%{x}
TP rate: %{y:.2%}", + ) + ) + fig.add_trace( + go.Scatter( + x=x_labels, + y=df_both["fpr"], + name="FP rate", + mode="lines", + line=dict(color=RUN_COLORS[1], width=2.5, shape="spline"), + fill="tozeroy", + fillcolor="rgba(232, 106, 51, 0.2)", + hovertemplate="%{x}
FP rate: %{y:.2%}", + ) + ) + apply_chart_theme(fig, height=420) + fig.update_layout( + title=f"TP & FP rate by distance (within {max_eval_range} m)", + xaxis_title="Distance bin", + yaxis_title="Rate", + yaxis_range=[0, 1], + xaxis=dict( + tickangle=-35, + categoryorder="array", + categoryarray=x_labels, + ), + hovermode="x unified", + ) + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig, width='stretch') + else: + # Bar chart (histogram): combined TP + FP grouped bars + fig = go.Figure() + fig.add_trace( go.Bar( - x=d["bin_label"], - y=d["tpr"], - name=lbl, - marker_color=RUN_COLORS[i % len(RUN_COLORS)], - hovertemplate=f"{lbl}
%{{x}}
TP rate: %{{y:.2%}}", + x=x_labels, + y=df_both["tpr"], + name="TP rate", + marker_color=RUN_COLORS[0], + hovertemplate="%{x}
TP rate: %{y:.2%}", ) ) - apply_chart_theme(fig_tpr, height=420) - fig_tpr.update_layout( - title=f"TP rate by distance", - xaxis_title="Distance bin", - yaxis_title="TP rate", - yaxis_range=[0, 1], - barmode="group", - xaxis=_xaxis_dist_bins, - hovermode="x unified", - ) - fig_tpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig_tpr, width='stretch') - else: - st.info("No TP rate by distance data.") - - if not df_fpr_dist.empty: - fig_fpr = go.Figure() - for i, lbl in enumerate(run_labels_list): - d = df_fpr_dist[df_fpr_dist["run"] == lbl].sort_values("bin_order") - fig_fpr.add_trace( + fig.add_trace( go.Bar( - x=d["bin_label"], - y=d["fpr"], - name=lbl, - marker_color=RUN_COLORS[i % len(RUN_COLORS)], - hovertemplate=f"{lbl}
%{{x}}
FP rate: %{{y:.2%}}", + x=x_labels, + y=df_both["fpr"], + name="FP rate", + marker_color=RUN_COLORS[1], + hovertemplate="%{x}
FP rate: %{y:.2%}", ) ) - apply_chart_theme(fig_fpr, height=420) - fig_fpr.update_layout( - title=f"FP rate by distance", - xaxis_title="Distance bin", - yaxis_title="FP rate", - yaxis_range=[0, 1], - barmode="group", - xaxis=_xaxis_dist_bins, - hovermode="x unified", - ) - fig_fpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") - st.plotly_chart(fig_fpr, width='stretch') + apply_chart_theme(fig, height=420) + fig.update_layout( + title=f"TP & FP rate by distance (within {max_eval_range} m)", + xaxis_title="Distance bin", + yaxis_title="Rate", + yaxis_range=[0, 1], + barmode="group", + xaxis=dict( + tickangle=-35, + categoryorder="array", + categoryarray=x_labels, + ), + hovermode="x unified", + ) + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig, width='stretch') else: - st.info("No FP rate by distance data.") - - # Object count by same distance bins as TP/FP; same line vs bar style; aligned x-axis - - try: - if single_mode: - q_oc = f""" - SELECT ({_DIST_BIN_CASE}) AS distance_bin, label, COUNT(*) AS n - FROM view_eval_flat - WHERE {filter_clause_base} - GROUP BY 1, 2 - """ - df_oc = con.execute(q_oc).df() + st.info("No distance-bin data available.") else: - dfs_oc = [] + # Compare mode: fetch TP and FP by distance per run + ds_dlog("distance: compare_mode n_runs=%s", len(runs)) + dfs_tpr = [] for i in range(len(runs)): - fc_oc = build_filter_clause(filters_list[i], enable_dist_h=False) - q_oc_i = f""" - SELECT ({_DIST_BIN_CASE}) AS distance_bin, COUNT(*) AS n - FROM {_flat_view(i)} - WHERE {fc_oc} - GROUP BY 1 - """ - df_oci = con.execute(q_oc_i).df() - df_oci["run"] = run_labels_list[i] - dfs_oc.append(df_oci) - df_oc = pd.concat(dfs_oc, ignore_index=True) - - if df_oc.empty: - st.info("No object count data by distance bin.") - else: - df_oc = df_oc.copy() - df_oc["bin_order"], df_oc["bin_label"] = zip(*df_oc["distance_bin"].map(_distance_bin_order_and_label)) - if rate_bin_labels_order: - align_x = 
list(rate_bin_labels_order) - else: - align_x = ( - df_oc.drop_duplicates("distance_bin") + fc = build_filter_clause(filters_list[i], enable_dist_h=False) + q = sql_distance_bin_rates_from_eval_flat(_flat_view(i), fc, metrics="tpr") + ds_dlog("distance: compare run %s/%s TPR by bin query", i + 1, len(runs)) + df_i = con.execute(q).df() + ds_dlog("distance: compare TPR query run %s rows=%s", i, len(df_i)) + df_i["run"] = run_labels_list[i] + df_i["bin_order"], df_i["bin_label"] = zip(*df_i["distance_bin"].map(_distance_bin_order_and_label)) + df_i = df_i.sort_values("bin_order") + dfs_tpr.append(df_i) + df_tpr_dist = pd.concat(dfs_tpr, ignore_index=True) + ds_dlog("distance: df_tpr_dist total_rows=%s", len(df_tpr_dist)) + + dfs_fpr = [] + for i in range(len(runs)): + fc = build_filter_clause(filters_list[i], enable_dist_h=False) + q = sql_distance_bin_rates_from_eval_flat(_flat_view(i), fc, metrics="fpr") + ds_dlog("distance: compare run %s/%s FPR by bin query", i + 1, len(runs)) + df_i = con.execute(q).df() + ds_dlog("distance: compare FPR query run %s rows=%s", i, len(df_i)) + df_i["run"] = run_labels_list[i] + df_i["bin_order"], df_i["bin_label"] = zip(*df_i["distance_bin"].map(_distance_bin_order_and_label)) + df_i = df_i.sort_values("bin_order") + dfs_fpr.append(df_i) + df_fpr_dist = pd.concat(dfs_fpr, ignore_index=True) + + if not df_tpr_dist.empty: + rate_bin_labels_order = ( + df_tpr_dist[df_tpr_dist["run"] == run_labels_list[0]] .sort_values("bin_order")["bin_label"] .tolist() ) - - xaxis_oc = dict(tickangle=-35, categoryorder="array", categoryarray=align_x) - - if single_mode: - pivot_oc = df_oc.pivot_table( - index="bin_label", columns="label", values="n", aggfunc="sum", fill_value=0 - ) - pivot_oc = pivot_oc.reindex(align_x, fill_value=0) - - fig_oc = go.Figure() - if use_line_chart: - for j, lab in enumerate(pivot_oc.columns): - c = RUN_COLORS[j % len(RUN_COLORS)] + _xaxis_dist_bins = ( + dict(tickangle=-35, categoryorder="array", 
categoryarray=rate_bin_labels_order) + if rate_bin_labels_order + else dict(tickangle=-35) + ) + + if use_line_chart: + if not df_tpr_dist.empty: + fig_tpr = go.Figure() + for i, lbl in enumerate(run_labels_list): + d = df_tpr_dist[df_tpr_dist["run"] == lbl].sort_values("bin_order") + c = RUN_COLORS[i % len(RUN_COLORS)] r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) - nm = str(lab) - fig_oc.add_trace( + fig_tpr.add_trace( go.Scatter( - x=align_x, - y=pivot_oc[lab].values, - name=nm, + x=d["bin_label"], + y=d["tpr"], + name=lbl, mode="lines", line=dict(color=c, width=2.2, shape="spline"), fill="tozeroy", - fillcolor=f"rgba({r},{g},{b},0.12)", - hovertemplate=f"{nm}
%{{x}}
Count: %{{y:.0f}}", + fillcolor=f"rgba({r},{g},{b},0.15)", + hovertemplate=f"{lbl}
%{{x}}
TP rate: %{{y:.2%}}", ) ) + apply_chart_theme(fig_tpr, height=420) + fig_tpr.update_layout( + title=f"TP rate by distance", + xaxis_title="Distance bin", + yaxis_title="TP rate", + yaxis_range=[0, 1], + xaxis=_xaxis_dist_bins, + hovermode="x unified", + ) + fig_tpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig_tpr, width='stretch') else: - for j, lab in enumerate(pivot_oc.columns): - c = RUN_COLORS[j % len(RUN_COLORS)] - nm = str(lab) - fig_oc.add_trace( - go.Bar( - x=align_x, - y=pivot_oc[lab].values, - name=nm, - marker_color=c, - hovertemplate=f"{nm}
%{{x}}
Count: %{{y:.0f}}", - ) - ) - apply_chart_theme(fig_oc, height=420) - fig_oc.update_layout( - title=f"Object count by distance bin (within {max_eval_range} m)", - xaxis_title="Distance bin", - yaxis_title="Count", - xaxis=xaxis_oc, - hovermode="x unified", - **({"barmode": "group"} if not use_line_chart else {}), - ) - st.plotly_chart(fig_oc, width='stretch') - else: - pivot_oc = df_oc.pivot_table( - index="bin_label", columns="run", values="n", aggfunc="sum", fill_value=0 - ) - pivot_oc = pivot_oc.reindex(align_x, fill_value=0) - run_cols = [r for r in run_labels_list if r in pivot_oc.columns] - - fig_oc = go.Figure() - if use_line_chart: - for j, rl in enumerate(run_cols): - c = RUN_COLORS[j % len(RUN_COLORS)] + st.info("No TP rate by distance data.") + + if not df_fpr_dist.empty: + fig_fpr = go.Figure() + for i, lbl in enumerate(run_labels_list): + d = df_fpr_dist[df_fpr_dist["run"] == lbl].sort_values("bin_order") + c = RUN_COLORS[i % len(RUN_COLORS)] r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) - fig_oc.add_trace( + fig_fpr.add_trace( go.Scatter( - x=align_x, - y=pivot_oc[rl].values, - name=str(rl), + x=d["bin_label"], + y=d["fpr"], + name=lbl, mode="lines", line=dict(color=c, width=2.2, shape="spline"), fill="tozeroy", fillcolor=f"rgba({r},{g},{b},0.15)", - hovertemplate=f"{rl}
%{{x}}
Count: %{{y:.0f}}", + hovertemplate=f"{lbl}
%{{x}}
FP rate: %{{y:.2%}}", ) ) + apply_chart_theme(fig_fpr, height=420) + fig_fpr.update_layout( + title=f"FP rate by distance", + xaxis_title="Distance bin", + yaxis_title="FP rate", + yaxis_range=[0, 1], + xaxis=_xaxis_dist_bins, + hovermode="x unified", + ) + fig_fpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig_fpr, width='stretch') else: - for j, rl in enumerate(run_cols): - c = RUN_COLORS[j % len(RUN_COLORS)] - fig_oc.add_trace( + st.info("No FP rate by distance data.") + else: + # Bar chart (histogram) for compare: TP then FP, grouped by run + if not df_tpr_dist.empty: + fig_tpr = go.Figure() + for i, lbl in enumerate(run_labels_list): + d = df_tpr_dist[df_tpr_dist["run"] == lbl].sort_values("bin_order") + fig_tpr.add_trace( go.Bar( - x=align_x, - y=pivot_oc[rl].values, - name=str(rl), - marker_color=c, - hovertemplate=f"{rl}
%{{x}}
Count: %{{y:.0f}}", + x=d["bin_label"], + y=d["tpr"], + name=lbl, + marker_color=RUN_COLORS[i % len(RUN_COLORS)], + hovertemplate=f"{lbl}
%{{x}}
TP rate: %{{y:.2%}}", ) ) - apply_chart_theme(fig_oc, height=420) - fig_oc.update_layout( - title=f"Object count by distance bin", - xaxis_title="Distance bin", - yaxis_title="Count", - xaxis=xaxis_oc, - hovermode="x unified", - **({"barmode": "group"} if not use_line_chart else {}), - ) - st.plotly_chart(fig_oc, width='stretch') - except Exception as e_oc: - st.error(f"Error (object count by distance bin): {e_oc}") - -except Exception as e: - st.error(f"Error: {e}") -finally: - _dist_slot.empty() -# ============================= -# Panel 2: TP Rate (single) / TP Rate Comparison (compare) -# ============================= -st.markdown( - section_header_html( - "TP Rate" + (" Comparison" if not single_mode else ""), - "TP rate per object class (GT TP / (TP+FN)). Pick a chart style below.", - ), - unsafe_allow_html=True, -) - -_tpr_query = """ -SELECT - label, - CASE - WHEN COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) > 0 - THEN CAST(COUNT(*) FILTER (WHERE source='GT' AND status='TP') AS DOUBLE) - / COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) - ELSE 0 - END AS tpr -FROM {view} -WHERE {filter_clause} -GROUP BY label -ORDER BY label -""" - -# Compare-mode TP rate spider charts: several distance caps + no cap (sidebar range not used for this view) -TPR_COMPARE_SPIDER_RANGES: List[Tuple[Optional[int], str]] = [ - (50, "≤50 m"), - (80, "≤80 m"), - (100, "≤100 m"), - (120, "≤120 m"), - (150, "≤150 m"), - (None, "All distances"), -] - -if single_mode: - tpr_viz = st.radio( - "TP rate chart style", - options=["Bar chart", "Lollipop (ranked)"], - index=0, - horizontal=True, - key="tpr_viz_single", - ) - try: - with ds_spot_loading("TP rate"): - filter_clause = build_filter_clause(filters_base) - query = _tpr_query.format(view="view_eval_flat", filter_clause=filter_clause) - df_tpr_base = con.execute(query).df() - if not df_tpr_base.empty: - title = f"Total TP rate within {max_eval_range} [m]" - if tpr_viz == "Bar chart": - fig = px.bar( - 
df_tpr_base, - x="label", - y="tpr", - title=title, - labels={"tpr": "TP Rate", "label": "Label"}, - ) - apply_chart_theme(fig) - fig.update_layout(yaxis_range=[0, 1.2]) - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") - st.plotly_chart(fig, width='stretch') + apply_chart_theme(fig_tpr, height=420) + fig_tpr.update_layout( + title=f"TP rate by distance", + xaxis_title="Distance bin", + yaxis_title="TP rate", + yaxis_range=[0, 1], + barmode="group", + xaxis=_xaxis_dist_bins, + hovermode="x unified", + ) + fig_tpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig_tpr, width='stretch') + else: + st.info("No TP rate by distance data.") + + if not df_fpr_dist.empty: + fig_fpr = go.Figure() + for i, lbl in enumerate(run_labels_list): + d = df_fpr_dist[df_fpr_dist["run"] == lbl].sort_values("bin_order") + fig_fpr.add_trace( + go.Bar( + x=d["bin_label"], + y=d["fpr"], + name=lbl, + marker_color=RUN_COLORS[i % len(RUN_COLORS)], + hovertemplate=f"{lbl}
%{{x}}
FP rate: %{{y:.2%}}", + ) + ) + apply_chart_theme(fig_fpr, height=420) + fig_fpr.update_layout( + title=f"FP rate by distance", + xaxis_title="Distance bin", + yaxis_title="FP rate", + yaxis_range=[0, 1], + barmode="group", + xaxis=_xaxis_dist_bins, + hovermode="x unified", + ) + fig_fpr.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.25)") + st.plotly_chart(fig_fpr, width='stretch') + else: + st.info("No FP rate by distance data.") + + # Object count by same distance bins as TP/FP; same line vs bar style; aligned x-axis + + try: + if single_mode: + q_oc = f""" + SELECT ({_DIST_BIN_CASE}) AS distance_bin, label, COUNT(*) AS n + FROM view_eval_flat + WHERE {filter_clause_base} + GROUP BY 1, 2 + """ + df_oc = con.execute(q_oc).df() else: - fig = _tpr_lollipop_single(df_tpr_base, title) - st.plotly_chart(fig, width='stretch') - else: - st.info("No data available") - except Exception as e: - st.error(f"Error: {e}") -else: - tpr_opts = ["Spider chart", "Grouped bar", "Heatmap (label × run)", "Line profile"] - tpr_viz = st.radio( - "TP rate chart style", - options=tpr_opts, - index=0, - horizontal=True, - key="tpr_viz_compare", - ) - try: - with ds_spot_loading("TP rate"): - dfs_tpr = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i]) - q = _tpr_query.format(view=_flat_view(i), filter_clause=fc) - df_i = con.execute(q).df() - df_i["run"] = run_labels_list[i] - dfs_tpr.append(df_i) - df_tpr_all = pd.concat(dfs_tpr, ignore_index=True) - if tpr_viz == "Spider chart": - st.caption( - "Six spider charts use **fixed distance cutoffs** (50–150 m) plus **all distances**. " - "Topic / label / suite / visibility filters still apply. " - "Other chart types and the rest of the page use the sidebar **Max Evaluation Range**." 
- ) - fb_all = {**filters_base, "max_eval_range": None} - label_union: set = set() - for i in range(len(runs)): - fc_a = build_filter_clause(fb_all) - q_a = _tpr_query.format(view=_flat_view(i), filter_clause=fc_a) - dfa = con.execute(q_a).df() - label_union |= set(dfa["label"].astype(str)) - cats = sorted(label_union) - if not cats: - st.info("No TP rate data for any distance range with current filters.") + dfs_oc = [] + for i in range(len(runs)): + fc_oc = build_filter_clause(filters_list[i], enable_dist_h=False) + q_oc_i = f""" + SELECT ({_DIST_BIN_CASE}) AS distance_bin, COUNT(*) AS n + FROM {_flat_view(i)} + WHERE {fc_oc} + GROUP BY 1 + """ + df_oci = con.execute(q_oc_i).df() + df_oci["run"] = run_labels_list[i] + dfs_oc.append(df_oci) + df_oc = pd.concat(dfs_oc, ignore_index=True) + + if df_oc.empty: + st.info("No object count data by distance bin.") else: - if len(cats) > 16: - st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") - for row_start in range(0, len(TPR_COMPARE_SPIDER_RANGES), 3): - row_ranges = TPR_COMPARE_SPIDER_RANGES[row_start : row_start + 3] - cols = st.columns(len(row_ranges)) - for col, (max_r, cap_lbl) in zip(cols, row_ranges): - fb = {**filters_base, "max_eval_range": max_r} - dfs_slice = [] - for i in range(len(runs)): - fc = build_filter_clause(fb) - q = _tpr_query.format(view=_flat_view(i), filter_clause=fc) - dfi = con.execute(q).df() - dfi["run"] = run_labels_list[i] - dfs_slice.append(dfi) - df_slice = pd.concat(dfs_slice, ignore_index=True) - with col: - if df_slice.empty: - st.info(f"No data ({cap_lbl}).") - else: - fig = _tpr_spider_compare( - df_slice, - cats, - f"TP rate ({cap_lbl})", - run_labels_list, - height=360, + df_oc = df_oc.copy() + df_oc["bin_order"], df_oc["bin_label"] = zip(*df_oc["distance_bin"].map(_distance_bin_order_and_label)) + if rate_bin_labels_order: + align_x = list(rate_bin_labels_order) + else: + align_x = ( + df_oc.drop_duplicates("distance_bin") + 
.sort_values("bin_order")["bin_label"] + .tolist() + ) + + xaxis_oc = dict(tickangle=-35, categoryorder="array", categoryarray=align_x) + + if single_mode: + pivot_oc = df_oc.pivot_table( + index="bin_label", columns="label", values="n", aggfunc="sum", fill_value=0 + ) + pivot_oc = pivot_oc.reindex(align_x, fill_value=0) + + fig_oc = go.Figure() + if use_line_chart: + for j, lab in enumerate(pivot_oc.columns): + c = RUN_COLORS[j % len(RUN_COLORS)] + r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) + nm = str(lab) + fig_oc.add_trace( + go.Scatter( + x=align_x, + y=pivot_oc[lab].values, + name=nm, + mode="lines", + line=dict(color=c, width=2.2, shape="spline"), + fill="tozeroy", + fillcolor=f"rgba({r},{g},{b},0.12)", + hovertemplate=f"{nm}
%{{x}}
Count: %{{y:.0f}}", ) - st.plotly_chart(fig, width='stretch') - elif not df_tpr_all.empty: - title = f"Total TP rate within {max_eval_range} [m] by run" - if tpr_viz == "Grouped bar": - fig = px.bar( - df_tpr_all, - x="label", - y="tpr", - color="run", - barmode="group", - title=title, - labels={"tpr": "TP Rate", "label": "Label", "run": "Run"}, - color_discrete_sequence=RUN_COLORS, - ) - apply_chart_theme(fig) - fig.update_layout(yaxis_range=[0, 1.2]) - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") - st.plotly_chart(fig, width='stretch') - elif tpr_viz == "Heatmap (label × run)": - pivot = df_tpr_all.pivot_table(index="label", columns="run", values="tpr", aggfunc="first") - cols_present = [c for c in run_labels_list if c in pivot.columns] - if cols_present: - pivot = pivot[cols_present] - fig = px.imshow( - pivot, - labels=dict(x="Run", y="Label", color="TP rate"), - title=title, - color_continuous_scale="RdYlGn", - zmin=0, - zmax=1, - aspect="auto", - ) - apply_chart_theme(fig, height=max(360, 32 + 22 * len(pivot.index))) - fig.update_layout(xaxis_side="top") - st.plotly_chart(fig, width='stretch') - elif tpr_viz == "Line profile": - fig = px.line( - df_tpr_all, - x="label", - y="tpr", - color="run", - markers=True, - title=title, - labels={"tpr": "TP Rate", "label": "Label", "run": "Run"}, - color_discrete_sequence=RUN_COLORS, - ) - fig.update_traces(line=dict(width=2.5), marker=dict(size=8)) - apply_chart_theme(fig, height=400) - fig.update_layout(yaxis_range=[0, 1.15], xaxis_tickangle=-35, hovermode="x unified") - fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") - st.plotly_chart(fig, width='stretch') - else: - st.info("No data available") + ) + else: + for j, lab in enumerate(pivot_oc.columns): + c = RUN_COLORS[j % len(RUN_COLORS)] + nm = str(lab) + fig_oc.add_trace( + go.Bar( + x=align_x, + y=pivot_oc[lab].values, + name=nm, + marker_color=c, + hovertemplate=f"{nm}
%{{x}}
Count: %{{y:.0f}}", + ) + ) + apply_chart_theme(fig_oc, height=420) + fig_oc.update_layout( + title=f"Object count by distance bin (within {max_eval_range} m)", + xaxis_title="Distance bin", + yaxis_title="Count", + xaxis=xaxis_oc, + hovermode="x unified", + **({"barmode": "group"} if not use_line_chart else {}), + ) + st.plotly_chart(fig_oc, width='stretch') + else: + pivot_oc = df_oc.pivot_table( + index="bin_label", columns="run", values="n", aggfunc="sum", fill_value=0 + ) + pivot_oc = pivot_oc.reindex(align_x, fill_value=0) + run_cols = [r for r in run_labels_list if r in pivot_oc.columns] + + fig_oc = go.Figure() + if use_line_chart: + for j, rl in enumerate(run_cols): + c = RUN_COLORS[j % len(RUN_COLORS)] + r, g, b = int(c[1:3], 16), int(c[3:5], 16), int(c[5:7], 16) + fig_oc.add_trace( + go.Scatter( + x=align_x, + y=pivot_oc[rl].values, + name=str(rl), + mode="lines", + line=dict(color=c, width=2.2, shape="spline"), + fill="tozeroy", + fillcolor=f"rgba({r},{g},{b},0.15)", + hovertemplate=f"{rl}
%{{x}}
Count: %{{y:.0f}}", + ) + ) + else: + for j, rl in enumerate(run_cols): + c = RUN_COLORS[j % len(RUN_COLORS)] + fig_oc.add_trace( + go.Bar( + x=align_x, + y=pivot_oc[rl].values, + name=str(rl), + marker_color=c, + hovertemplate=f"{rl}
%{{x}}
Count: %{{y:.0f}}", + ) + ) + apply_chart_theme(fig_oc, height=420) + fig_oc.update_layout( + title=f"Object count by distance bin", + xaxis_title="Distance bin", + yaxis_title="Count", + xaxis=xaxis_oc, + hovermode="x unified", + **({"barmode": "group"} if not use_line_chart else {}), + ) + st.plotly_chart(fig_oc, width='stretch') + except Exception as e_oc: + st.error(f"Error (object count by distance bin): {e_oc}") + except Exception as e: st.error(f"Error: {e}") -# ============================= -# Panel 5: Perception diff vs baseline A (compare mode only) -# ============================= -def _baobab_hierarchy_from_objects( - df_obj: pd.DataFrame, - change_type: str, - root_label: str, - max_scenarios: int, - max_frames: int, -) -> pd.DataFrame: - """ - Build a leaf table for Plotly sunburst/treemap: root → scenario → frame → label. - Caps scenarios and frames per scenario; merges the rest into Other buckets. - """ - if df_obj.empty or "change_type" not in df_obj.columns: - return pd.DataFrame() - sub = df_obj[df_obj["change_type"] == change_type].copy() - if sub.empty: - return pd.DataFrame() - sub["scenario_name"] = sub["scenario_name"].fillna("").astype(str).replace("", "(no scenario)") - sub["label"] = sub["label"].fillna("").astype(str).replace("", "(no label)") - sub["frame_key"] = ( - sub["t4dataset_id"].astype(str) + "|f" + sub["frame_index"].astype(str) - ) - leaf = ( - sub.groupby(["scenario_name", "frame_key", "label"], dropna=False) - .size() - .reset_index(name="n") - ) - if leaf.empty: - return pd.DataFrame() - ms = max(int(max_scenarios), 1) - mf = max(int(max_frames), 1) - scen_tot = leaf.groupby("scenario_name")["n"].sum().sort_values(ascending=False) - top_scen = set(scen_tot.head(ms).index) - leaf["scen_g"] = np.where( - leaf["scenario_name"].isin(top_scen), - leaf["scenario_name"], - "Other scenarios", - ) - parts = [] - for _, g in leaf.groupby("scen_g"): - fr_tot = g.groupby("frame_key")["n"].sum().sort_values(ascending=False) - top_fr = 
set(fr_tot.head(mf).index) - g2 = g.copy() - g2["fr_g"] = np.where(g2["frame_key"].isin(top_fr), g2["frame_key"], "Other frames") - agg = g2.groupby(["scen_g", "fr_g", "label"], as_index=False)["n"].sum() - parts.append(agg) - out = pd.concat(parts, ignore_index=True) - out["root"] = root_label - - def _frame_ring_label(fr_g: str, scen_g: str) -> str: - if fr_g == "Other frames" or str(fr_g) == "Other frames": - return "Other frames" - sfg = str(fr_g) - if "|f" not in sfg: - return sfg - fid = sfg.split("|f", 1)[-1] - if scen_g == "Other scenarios": - t4 = sfg.split("|f", 1)[0] - t4s = t4 if len(t4) <= 14 else ("…" + t4[-12:]) - return f"{t4s}|f{fid}" - return f"f{fid}" - - out["fr_display"] = out.apply( - lambda r: _frame_ring_label(r["fr_g"], r["scen_g"]), axis=1 - ) - return out - - -def _comparison_lens_treemap_df( - names: pd.Series, - improved: pd.Series, - degraded: pd.Series, - root_title: str, -) -> pd.DataFrame: - """Rows for px.treemap path root → Improved|Degraded → item (area = n).""" - rows = [] - for i in range(len(names)): - nm = str(names.iloc[i]).strip() or "—" - if len(nm) > 72: - nm = nm[:69] + "…" - ip = float(improved.iloc[i]) if pd.notna(improved.iloc[i]) else 0.0 - dg = float(degraded.iloc[i]) if pd.notna(degraded.iloc[i]) else 0.0 - if ip > 0: - rows.append( - {"root": root_title, "side": "Improved", "item": nm, "n": ip} - ) - if dg > 0: - rows.append( - {"root": root_title, "side": "Degraded", "item": nm, "n": dg} - ) - return pd.DataFrame(rows) - - -def _plot_comparison_lens_treemap( - tdf: pd.DataFrame, - st_key: str, - title: str, -) -> None: - if tdf is None or tdf.empty: - st.caption("_No data for this view._") - return - fig = px.treemap( - tdf, - path=["root", "side", "item"], - values="n", - color="side", - color_discrete_map={"Improved": IMPROVED_COLOR, "Degraded": DEGRADED_COLOR}, - ) - fig.update_traces( - textfont_size=12, - textinfo="label+value+percent parent", - hovertemplate=( - "%{label}
" - "GT objects: %{value:.0f}
" - "% of parent: %{percentParent}" - ), - marker_line_width=1.5, - marker_line_color="rgba(255,255,255,0.45)", - root_color="rgba(240,240,245,0.95)", - ) - _title_layout = {**PLOTLY_LAYOUT_THEME["title"], "text": title} - apply_chart_theme( - fig, - height=430, - margin=dict(t=20, l=2, r=2, b=2), - paper_bgcolor="rgba(0,0,0,0)", - title=_title_layout, - ) - st.plotly_chart(fig, width='stretch', key=st_key) - - -if not single_mode: - st.divider() + finally: + _dist_slot.empty() + ds_dlog("section: Panel3_5_Distance_end") + # ============================= + # Panel 2: TP Rate (single) / TP Rate Comparison (compare) + # ============================= + ds_dlog("section: Panel2_TP_Rate_start") st.markdown( section_header_html( - "Perception diff (vs baseline A)", - "Per-GT-object comparison vs baseline A: degraded = was TP on A and FN on candidate; improved = was FN on A and TP on candidate. Hotspots prioritize regressions.", + "TP Rate" + (" Comparison" if not single_mode else ""), + "TP rate per object class (GT TP / (TP+FN)). 
Pick a chart style below.", ), unsafe_allow_html=True, ) - for idx in range(1, len(runs)): - lbl = run_labels_list[idx] - _pd_slot = st.empty() - _pd_slot.markdown(ds_spot_loading_markup(f"Perception diff · run {lbl}"), unsafe_allow_html=True) + + _tpr_query = """ + SELECT + label, + CASE + WHEN COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) > 0 + THEN CAST(COUNT(*) FILTER (WHERE source='GT' AND status='TP') AS DOUBLE) + / COUNT(*) FILTER (WHERE source='GT' AND status IN ('TP','FN')) + ELSE 0 + END AS tpr + FROM {view} + WHERE {filter_clause} + GROUP BY label + ORDER BY label + """ + + # Compare-mode TP rate spider charts: several distance caps + no cap (sidebar range not used for this view) + TPR_COMPARE_SPIDER_RANGES: List[Tuple[Optional[int], str]] = [ + (50, "≤50 m"), + (80, "≤80 m"), + (100, "≤100 m"), + (120, "≤120 m"), + (150, "≤150 m"), + (None, "All distances"), + ] + + if single_mode: + tpr_viz = st.radio( + "TP rate chart style", + options=["Bar chart", "Lollipop (ranked)"], + index=0, + horizontal=True, + key="tpr_viz_single", + ) try: - filter_clause_comp_p5 = build_filter_clause(filters_list[idx], enable_dist_h=False) - comp_flat = _flat_view(idx) - query = f""" - WITH base_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name - FROM view_eval_flat - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_base} - GROUP BY 1,2,3 + with ds_spot_loading("TP rate"): + filter_clause = build_filter_clause(filters_base) + query = _tpr_query.format(view="view_eval_flat", filter_clause=filter_clause) + df_tpr_base = con.execute(query).df() + if not df_tpr_base.empty: + title = f"Total TP rate within {max_eval_range} [m]" + if 
tpr_viz == "Bar chart": + fig = px.bar( + df_tpr_base, + x="label", + y="tpr", + title=title, + labels={"tpr": "TP Rate", "label": "Label"}, + ) + apply_chart_theme(fig) + fig.update_layout(yaxis_range=[0, 1.2]) + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") + st.plotly_chart(fig, width='stretch') + else: + fig = _tpr_lollipop_single(df_tpr_base, title) + st.plotly_chart(fig, width='stretch') + else: + st.info("No data available") + except Exception as e: + st.error(f"Error: {e}") + else: + tpr_opts = ["Spider chart", "Grouped bar", "Heatmap (label × run)", "Line profile"] + tpr_viz = st.radio( + "TP rate chart style", + options=tpr_opts, + index=0, + horizontal=True, + key="tpr_viz_compare", + ) + try: + with ds_spot_loading("TP rate"): + dfs_tpr = [] + for i in range(len(runs)): + fc = build_filter_clause(filters_list[i]) + q = _tpr_query.format(view=_flat_view(i), filter_clause=fc) + df_i = con.execute(q).df() + df_i["run"] = run_labels_list[i] + dfs_tpr.append(df_i) + df_tpr_all = pd.concat(dfs_tpr, ignore_index=True) + if tpr_viz == "Spider chart": + st.caption( + "Six spider charts use **fixed distance cutoffs** (50–150 m) plus **all distances**. " + "Topic / label / suite / visibility filters still apply. " + "Other chart types and the rest of the page use the sidebar **Max Evaluation Range**." 
+ ) + fb_all = {**filters_base, "max_eval_range": None} + label_union: set = set() + for i in range(len(runs)): + fc_a = build_filter_clause(fb_all) + q_a = _tpr_query.format(view=_flat_view(i), filter_clause=fc_a) + dfa = con.execute(q_a).df() + label_union |= set(dfa["label"].astype(str)) + cats = sorted(label_union) + if not cats: + st.info("No TP rate data for any distance range with current filters.") + else: + if len(cats) > 16: + st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") + for row_start in range(0, len(TPR_COMPARE_SPIDER_RANGES), 3): + row_ranges = TPR_COMPARE_SPIDER_RANGES[row_start : row_start + 3] + cols = st.columns(len(row_ranges)) + for col, (max_r, cap_lbl) in zip(cols, row_ranges): + fb = {**filters_base, "max_eval_range": max_r} + dfs_slice = [] + for i in range(len(runs)): + fc = build_filter_clause(fb) + q = _tpr_query.format(view=_flat_view(i), filter_clause=fc) + dfi = con.execute(q).df() + dfi["run"] = run_labels_list[i] + dfs_slice.append(dfi) + df_slice = pd.concat(dfs_slice, ignore_index=True) + with col: + if df_slice.empty: + st.info(f"No data ({cap_lbl}).") + else: + fig = _tpr_spider_compare( + df_slice, + cats, + f"TP rate ({cap_lbl})", + run_labels_list, + height=360, + ) + st.plotly_chart(fig, width='stretch') + elif not df_tpr_all.empty: + title = f"Total TP rate within {max_eval_range} [m] by run" + if tpr_viz == "Grouped bar": + fig = px.bar( + df_tpr_all, + x="label", + y="tpr", + color="run", + barmode="group", + title=title, + labels={"tpr": "TP Rate", "label": "Label", "run": "Run"}, + color_discrete_sequence=RUN_COLORS, + ) + apply_chart_theme(fig) + fig.update_layout(yaxis_range=[0, 1.2]) + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") + st.plotly_chart(fig, width='stretch') + elif tpr_viz == "Heatmap (label × run)": + pivot = df_tpr_all.pivot_table(index="label", columns="run", values="tpr", aggfunc="first") + cols_present = [c for c in run_labels_list if c 
in pivot.columns] + if cols_present: + pivot = pivot[cols_present] + fig = px.imshow( + pivot, + labels=dict(x="Run", y="Label", color="TP rate"), + title=title, + color_continuous_scale="RdYlGn", + zmin=0, + zmax=1, + aspect="auto", + ) + apply_chart_theme(fig, height=max(360, 32 + 22 * len(pivot.index))) + fig.update_layout(xaxis_side="top") + st.plotly_chart(fig, width='stretch') + elif tpr_viz == "Line profile": + fig = px.line( + df_tpr_all, + x="label", + y="tpr", + color="run", + markers=True, + title=title, + labels={"tpr": "TP Rate", "label": "Label", "run": "Run"}, + color_discrete_sequence=RUN_COLORS, + ) + fig.update_traces(line=dict(width=2.5), marker=dict(size=8)) + apply_chart_theme(fig, height=400) + fig.update_layout(yaxis_range=[0, 1.15], xaxis_tickangle=-35, hovermode="x unified") + fig.add_hline(y=0.5, line_dash="dash", line_color="rgba(0,0,0,0.2)") + st.plotly_chart(fig, width='stretch') + else: + st.info("No data available") + except Exception as e: + st.error(f"Error: {e}") + # ============================= + # Panel 5: Perception diff vs baseline A (compare mode only) + # ============================= + def _baobab_hierarchy_from_objects( + df_obj: pd.DataFrame, + change_type: str, + root_label: str, + max_scenarios: int, + max_frames: int, + ) -> pd.DataFrame: + """ + Build a leaf table for Plotly sunburst/treemap: root → scenario → frame → label. + Caps scenarios and frames per scenario; merges the rest into Other buckets. 
+ """ + if df_obj.empty or "change_type" not in df_obj.columns: + return pd.DataFrame() + sub = df_obj[df_obj["change_type"] == change_type].copy() + if sub.empty: + return pd.DataFrame() + sub["scenario_name"] = sub["scenario_name"].fillna("").astype(str).replace("", "(no scenario)") + sub["label"] = sub["label"].fillna("").astype(str).replace("", "(no label)") + sub["frame_key"] = ( + sub["t4dataset_id"].astype(str) + "|f" + sub["frame_index"].astype(str) + ) + leaf = ( + sub.groupby(["scenario_name", "frame_key", "label"], dropna=False) + .size() + .reset_index(name="n") + ) + if leaf.empty: + return pd.DataFrame() + ms = max(int(max_scenarios), 1) + mf = max(int(max_frames), 1) + scen_tot = leaf.groupby("scenario_name")["n"].sum().sort_values(ascending=False) + top_scen = set(scen_tot.head(ms).index) + leaf["scen_g"] = np.where( + leaf["scenario_name"].isin(top_scen), + leaf["scenario_name"], + "Other scenarios", + ) + parts = [] + for _, g in leaf.groupby("scen_g"): + fr_tot = g.groupby("frame_key")["n"].sum().sort_values(ascending=False) + top_fr = set(fr_tot.head(mf).index) + g2 = g.copy() + g2["fr_g"] = np.where(g2["frame_key"].isin(top_fr), g2["frame_key"], "Other frames") + agg = g2.groupby(["scen_g", "fr_g", "label"], as_index=False)["n"].sum() + parts.append(agg) + out = pd.concat(parts, ignore_index=True) + out["root"] = root_label + + def _frame_ring_label(fr_g: str, scen_g: str) -> str: + if fr_g == "Other frames" or str(fr_g) == "Other frames": + return "Other frames" + sfg = str(fr_g) + if "|f" not in sfg: + return sfg + fid = sfg.split("|f", 1)[-1] + if scen_g == "Other scenarios": + t4 = sfg.split("|f", 1)[0] + t4s = t4 if len(t4) <= 14 else ("…" + t4[-12:]) + return f"{t4s}|f{fid}" + return f"f{fid}" + + out["fr_display"] = out.apply( + lambda r: _frame_ring_label(r["fr_g"], r["scen_g"]), axis=1 + ) + return out + + + def _comparison_lens_treemap_df( + names: pd.Series, + improved: pd.Series, + degraded: pd.Series, + root_title: str, + ) -> 
pd.DataFrame: + """Rows for px.treemap path root → Improved|Degraded → item (area = n).""" + rows = [] + for i in range(len(names)): + nm = str(names.iloc[i]).strip() or "—" + if len(nm) > 72: + nm = nm[:69] + "…" + ip = float(improved.iloc[i]) if pd.notna(improved.iloc[i]) else 0.0 + dg = float(degraded.iloc[i]) if pd.notna(degraded.iloc[i]) else 0.0 + if ip > 0: + rows.append( + {"root": root_title, "side": "Improved", "item": nm, "n": ip} + ) + if dg > 0: + rows.append( + {"root": root_title, "side": "Degraded", "item": nm, "n": dg} + ) + return pd.DataFrame(rows) + + + def _plot_comparison_lens_treemap( + tdf: pd.DataFrame, + st_key: str, + title: str, + ) -> None: + if tdf is None or tdf.empty: + st.caption("_No data for this view._") + return + fig = px.treemap( + tdf, + path=["root", "side", "item"], + values="n", + color="side", + color_discrete_map={"Improved": IMPROVED_COLOR, "Degraded": DEGRADED_COLOR}, + ) + fig.update_traces( + textfont_size=12, + textinfo="label+value+percent parent", + hovertemplate=( + "%{label}
" + "GT objects: %{value:.0f}
" + "% of parent: %{percentParent}" ), - comp_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name - FROM {comp_flat} - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_comp_p5} - GROUP BY 1,2,3 + marker_line_width=1.5, + marker_line_color="rgba(255,255,255,0.45)", + root_color="rgba(240,240,245,0.95)", + ) + _title_layout = {**PLOTLY_LAYOUT_THEME["title"], "text": title} + apply_chart_theme( + fig, + height=430, + margin=dict(t=20, l=2, r=2, b=2), + paper_bgcolor="rgba(0,0,0,0)", + title=_title_layout, + ) + st.plotly_chart(fig, width='stretch', key=st_key) + + + if not single_mode: + ds_dlog("section: Perception_diff_start") + st.divider() + st.markdown( + section_header_html( + "Perception diff (vs baseline A)", + "Per-GT-object comparison vs baseline A: degraded = was TP on A and FN on candidate; improved = was FN on A and TP on candidate. 
Hotspots prioritize regressions.", ), - joined AS ( + unsafe_allow_html=True, + ) + for idx in range(1, len(runs)): + lbl = run_labels_list[idx] + _pd_slot = st.empty() + _pd_slot.markdown(ds_spot_loading_markup(f"Perception diff · run {lbl}"), unsafe_allow_html=True) + try: + filter_clause_comp_p5 = build_filter_clause(filters_list[idx], enable_dist_h=False) + comp_flat = _flat_view(idx) + query = f""" + WITH base_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM view_eval_flat + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_base} + GROUP BY 1,2,3 + ), + comp_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM {comp_flat} + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_comp_p5} + GROUP BY 1,2,3 + ), + joined AS ( + SELECT + COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, + COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, + COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, + COALESCE(b.tp_base, FALSE) AS tp_base, + COALESCE(c.tp_comp, FALSE) AS tp_comp, + COALESCE(b.suite_name, c.suite_name, '') AS suite_name, + COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, + COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name + FROM base_gt b + FULL OUTER JOIN comp_gt c + ON b.t4dataset_id = 
c.t4dataset_id + AND b.frame_index = c.frame_index + AND b.gt_uuid = c.gt_uuid + ) SELECT - COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, - COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, - COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, - COALESCE(b.tp_base, FALSE) AS tp_base, - COALESCE(c.tp_comp, FALSE) AS tp_comp, - COALESCE(b.suite_name, c.suite_name, '') AS suite_name, - COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, - COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name - FROM base_gt b - FULL OUTER JOIN comp_gt c - ON b.t4dataset_id = c.t4dataset_id - AND b.frame_index = c.frame_index - AND b.gt_uuid = c.gt_uuid - ) - SELECT - t4dataset_id, - CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, - CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta, - suite_name, - scenario_name, - t4dataset_name - FROM joined - GROUP BY t4dataset_id, suite_name, scenario_name, t4dataset_name - ORDER BY net_tp_delta DESC - """ - df_improved = con.execute(query).df() - if not df_improved.empty: - query_frame_p5 = f""" - WITH base_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name - FROM view_eval_flat - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND 
{filter_clause_base} - GROUP BY 1, 2, 3 - ), - comp_gt AS ( + t4dataset_id, + CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, + CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta, + suite_name, + scenario_name, + t4dataset_name + FROM joined + GROUP BY t4dataset_id, suite_name, scenario_name, t4dataset_name + ORDER BY net_tp_delta DESC + """ + df_improved = con.execute(query).df() + if not df_improved.empty: + query_frame_p5 = f""" + WITH base_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM view_eval_flat + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_base} + GROUP BY 1, 2, 3 + ), + comp_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM {comp_flat} + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_comp_p5} + GROUP BY 1, 2, 3 + ), + joined AS ( + SELECT + COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, + COALESCE(CAST(b.frame_index AS VARCHAR), 
CAST(c.frame_index AS VARCHAR)) AS frame_index, + COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, + COALESCE(b.tp_base, FALSE) AS tp_base, + COALESCE(c.tp_comp, FALSE) AS tp_comp, + COALESCE(b.suite_name, c.suite_name, '') AS suite_name, + COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, + COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name + FROM base_gt b + FULL OUTER JOIN comp_gt c + ON b.t4dataset_id = c.t4dataset_id + AND b.frame_index = c.frame_index + AND b.gt_uuid = c.gt_uuid + ) SELECT t4dataset_id, frame_index, - uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name - FROM {comp_flat} - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_comp_p5} - GROUP BY 1, 2, 3 - ), - joined AS ( + scenario_name, + suite_name, + t4dataset_name, + CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, + CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta + FROM joined + GROUP BY t4dataset_id, frame_index, suite_name, scenario_name, t4dataset_name + ORDER BY net_tp_delta DESC + """ + query_object_p5 = f""" + WITH base_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + 
COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM view_eval_flat + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_base} + GROUP BY 1, 2, 3 + ), + comp_gt AS ( + SELECT + t4dataset_id, + frame_index, + uuid AS gt_uuid, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, + COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + FROM {comp_flat} + WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL + AND {filter_clause_comp_p5} + GROUP BY 1, 2, 3 + ), + joined AS ( + SELECT + COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, + COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, + COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, + COALESCE(b.tp_base, FALSE) AS tp_base, + COALESCE(c.tp_comp, FALSE) AS tp_comp, + COALESCE(b.suite_name, c.suite_name, '') AS suite_name, + COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, + COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name + FROM base_gt b + FULL OUTER JOIN comp_gt c + ON b.t4dataset_id = c.t4dataset_id + AND b.frame_index = c.frame_index + AND b.gt_uuid = c.gt_uuid + ), + obj_attrs AS ( + SELECT + t4dataset_id, + frame_index, + uuid, + MAX(CAST(label AS VARCHAR)) AS label, + MAX(dist_h) AS dist_h + FROM view_eval_flat + WHERE source = 'GT' + GROUP BY 1, 2, 3 + ) SELECT - COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, - COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, - COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, - COALESCE(b.tp_base, FALSE) AS tp_base, - COALESCE(c.tp_comp, FALSE) AS tp_comp, - COALESCE(b.suite_name, c.suite_name, '') AS suite_name, - COALESCE(b.scenario_name, 
c.scenario_name, '') AS scenario_name, - COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name - FROM base_gt b - FULL OUTER JOIN comp_gt c - ON b.t4dataset_id = c.t4dataset_id - AND b.frame_index = c.frame_index - AND b.gt_uuid = c.gt_uuid + j.t4dataset_id, + j.frame_index, + j.gt_uuid, + COALESCE(e.label, '') AS label, + COALESCE(e.dist_h, 0.0) AS dist_h, + {_DIST_BIN_CASE.replace("dist_h", "COALESCE(e.dist_h, 0.0)")} AS distance_bin, + j.suite_name, + j.scenario_name, + j.t4dataset_name, + CASE + WHEN NOT j.tp_base AND j.tp_comp THEN 'improved' + WHEN j.tp_base AND NOT j.tp_comp THEN 'degraded' + WHEN j.tp_base AND j.tp_comp THEN 'both_tp' + ELSE 'both_fn' + END AS change_type, + j.tp_base, + j.tp_comp + FROM joined j + LEFT JOIN obj_attrs e + ON CAST(j.t4dataset_id AS VARCHAR) = CAST(e.t4dataset_id AS VARCHAR) + AND j.frame_index = CAST(e.frame_index AS VARCHAR) + AND j.gt_uuid = e.uuid + ORDER BY change_type, j.t4dataset_id, j.frame_index + """ + try: + df_by_frame = con.execute(query_frame_p5).df() + except Exception: + df_by_frame = pd.DataFrame() + try: + df_by_object_full = con.execute(query_object_p5).df() + except Exception: + df_by_object_full = pd.DataFrame() + + tot_imp = float(df_improved["improved_cnt"].sum()) + tot_deg = float(df_improved["degraded_cnt"].sum()) + tot_net = tot_imp - tot_deg + net_s = f"+{int(tot_net)}" if tot_net > 0 else str(int(tot_net)) + + with st.expander(f"Run {lbl} vs A", expanded=(len(runs) == 2)): + c1, c2, c3, c4 = st.columns(4) + c1.metric("Improved (FN→TP)", int(tot_imp)) + c2.metric("Degraded (TP→FN)", int(tot_deg)) + c3.metric("Net TP delta", net_s) + c4.caption("Start with scenarios and frames with the most **degraded** counts.") + st.markdown( + f"**Summary:** Net **{net_s}** TP vs baseline A — " + f"**{int(tot_deg)}** degraded vs **{int(tot_imp)}** improved." 
) - SELECT - t4dataset_id, - frame_index, - scenario_name, - suite_name, - t4dataset_name, - CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, - CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta - FROM joined - GROUP BY t4dataset_id, frame_index, suite_name, scenario_name, t4dataset_name - ORDER BY net_tp_delta DESC - """ - query_object_p5 = f""" + + b_key = f"p5_baobab_{lbl}_{idx}" + c1b, c2b, c3b = st.columns([1, 1, 1]) + with c1b: + baobab_viz = st.radio( + "Chart type", + ["Sunburst", "Treemap"], + horizontal=True, + key=f"{b_key}_viz", + ) + with c2b: + baobab_ns = st.slider( + "Max scenarios", + min_value=5, + max_value=25, + value=15, + key=f"{b_key}_ns", + ) + with c3b: + baobab_nf = st.slider( + "Max frames / scenario", + min_value=5, + max_value=20, + value=10, + key=f"{b_key}_nf", + ) + if df_by_object_full.empty: + st.caption("No object-level rows for hierarchy.") + else: + path_cols = ["root", "scen_g", "fr_display", "label"] + h_imp = _baobab_hierarchy_from_objects( + df_by_object_full, + "improved", + f"Improved ({lbl} vs A)", + baobab_ns, + baobab_nf, + ) + h_deg = _baobab_hierarchy_from_objects( + df_by_object_full, + "degraded", + f"Degraded ({lbl} vs A)", + baobab_ns, + baobab_nf, + ) + pair_both = (not h_imp.empty) and (not h_deg.empty) + plot_entries = [] + for ct, hdf, cmap in ( + ("improved", h_imp, IMPROVED_SCALE), + ("degraded", h_deg, DEGRADED_SCALE), + ): + if hdf.empty: + plot_entries.append((ct, None)) + continue + title = f"{baobab_viz}: {ct} (n = {int(hdf['n'].sum())} GT objects)" + if baobab_viz == "Sunburst": + fig_b = px.sunburst( + 
hdf, + path=path_cols, + values="n", + color="n", + color_continuous_scale=cmap, + title=title, + ) + h_sb = 480 if pair_both else 620 + apply_chart_theme(fig_b, height=h_sb, margin=dict(t=36, l=4, r=4, b=4)) + else: + fig_b = px.treemap( + hdf, + path=path_cols, + values="n", + color="n", + color_continuous_scale=cmap, + title=title, + ) + h_tr = 440 if pair_both else 520 + apply_chart_theme(fig_b, height=h_tr, margin=dict(t=40, l=4, r=4, b=4)) + plot_entries.append((ct, fig_b)) + + two_up = ( + len(plot_entries) == 2 + and plot_entries[0][1] is not None + and plot_entries[1][1] is not None + ) + if two_up: + bc1, bc2 = st.columns(2, gap="small") + with bc1: + st.plotly_chart( + plot_entries[0][1], + width='stretch', + key=f"{b_key}_fig_{plot_entries[0][0]}", + ) + with bc2: + st.plotly_chart( + plot_entries[1][1], + width='stretch', + key=f"{b_key}_fig_{plot_entries[1][0]}", + ) + else: + for ct, fig_b in plot_entries: + if fig_b is not None: + st.plotly_chart( + fig_b, + width='stretch', + key=f"{b_key}_fig_{ct}", + ) + else: + st.caption(f"No **{ct}** objects to chart.") + + # --- Comparison lens: label / scenario / frame (treemap trio, Baobab-aligned) --- + query_label = f""" WITH base_gt AS ( SELECT t4dataset_id, frame_index, uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base FROM view_eval_flat WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL AND {filter_clause_base} @@ -1984,10 +2424,8 @@ def _plot_comparison_lens_treemap( t4dataset_id, frame_index, uuid AS gt_uuid, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp, - COALESCE(MAX(try_cast(suite_name AS VARCHAR)), '') AS suite_name, - 
COALESCE(MAX(try_cast(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(try_cast(t4dataset_name AS VARCHAR)), '') AS t4dataset_name + COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, + COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp FROM {comp_flat} WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL AND {filter_clause_comp_p5} @@ -1995,853 +2433,659 @@ def _plot_comparison_lens_treemap( ), joined AS ( SELECT - COALESCE(CAST(b.t4dataset_id AS VARCHAR), CAST(c.t4dataset_id AS VARCHAR)) AS t4dataset_id, - COALESCE(CAST(b.frame_index AS VARCHAR), CAST(c.frame_index AS VARCHAR)) AS frame_index, - COALESCE(b.gt_uuid, c.gt_uuid) AS gt_uuid, + COALESCE(b.label, c.label) AS label, COALESCE(b.tp_base, FALSE) AS tp_base, - COALESCE(c.tp_comp, FALSE) AS tp_comp, - COALESCE(b.suite_name, c.suite_name, '') AS suite_name, - COALESCE(b.scenario_name, c.scenario_name, '') AS scenario_name, - COALESCE(b.t4dataset_name, c.t4dataset_name, '') AS t4dataset_name + COALESCE(c.tp_comp, FALSE) AS tp_comp FROM base_gt b FULL OUTER JOIN comp_gt c ON b.t4dataset_id = c.t4dataset_id AND b.frame_index = c.frame_index AND b.gt_uuid = c.gt_uuid - ), - obj_attrs AS ( - SELECT - t4dataset_id, - frame_index, - uuid, - MAX(CAST(label AS VARCHAR)) AS label, - MAX(dist_h) AS dist_h - FROM view_eval_flat - WHERE source = 'GT' - GROUP BY 1, 2, 3 ) SELECT - j.t4dataset_id, - j.frame_index, - j.gt_uuid, - COALESCE(e.label, '') AS label, - COALESCE(e.dist_h, 0.0) AS dist_h, - {_DIST_BIN_CASE.replace("dist_h", "COALESCE(e.dist_h, 0.0)")} AS distance_bin, - j.suite_name, - j.scenario_name, - j.t4dataset_name, - CASE - WHEN NOT j.tp_base AND j.tp_comp THEN 'improved' - WHEN j.tp_base AND NOT j.tp_comp THEN 'degraded' - WHEN j.tp_base AND j.tp_comp THEN 'both_tp' - ELSE 'both_fn' - END AS change_type, - j.tp_base, - j.tp_comp - FROM joined j - LEFT JOIN obj_attrs e - ON CAST(j.t4dataset_id AS VARCHAR) = CAST(e.t4dataset_id AS VARCHAR) - AND j.frame_index = 
CAST(e.frame_index AS VARCHAR) - AND j.gt_uuid = e.uuid - ORDER BY change_type, j.t4dataset_id, j.frame_index + label, + CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, + CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, + CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, + CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta + FROM joined + GROUP BY label + ORDER BY net_tp_delta DESC """ - try: - df_by_frame = con.execute(query_frame_p5).df() - except Exception: - df_by_frame = pd.DataFrame() - try: - df_by_object_full = con.execute(query_object_p5).df() - except Exception: - df_by_object_full = pd.DataFrame() - - tot_imp = float(df_improved["improved_cnt"].sum()) - tot_deg = float(df_improved["degraded_cnt"].sum()) - tot_net = tot_imp - tot_deg - net_s = f"+{int(tot_net)}" if tot_net > 0 else str(int(tot_net)) - - with st.expander(f"Run {lbl} vs A", expanded=(len(runs) == 2)): - c1, c2, c3, c4 = st.columns(4) - c1.metric("Improved (FN→TP)", int(tot_imp)) - c2.metric("Degraded (TP→FN)", int(tot_deg)) - c3.metric("Net TP delta", net_s) - c4.caption("Start with scenarios and frames with the most **degraded** counts.") - st.markdown( - f"**Summary:** Net **{net_s}** TP vs baseline A — " - f"**{int(tot_deg)}** degraded vs **{int(tot_imp)}** improved." 
- ) - - b_key = f"p5_baobab_{lbl}_{idx}" - c1b, c2b, c3b = st.columns([1, 1, 1]) - with c1b: - baobab_viz = st.radio( - "Chart type", - ["Sunburst", "Treemap"], - horizontal=True, - key=f"{b_key}_viz", - ) - with c2b: - baobab_ns = st.slider( - "Max scenarios", - min_value=5, - max_value=25, - value=15, - key=f"{b_key}_ns", - ) - with c3b: - baobab_nf = st.slider( - "Max frames / scenario", - min_value=5, - max_value=20, - value=10, - key=f"{b_key}_nf", - ) - if df_by_object_full.empty: - st.caption("No object-level rows for hierarchy.") - else: - path_cols = ["root", "scen_g", "fr_display", "label"] - h_imp = _baobab_hierarchy_from_objects( - df_by_object_full, - "improved", - f"Improved ({lbl} vs A)", - baobab_ns, - baobab_nf, - ) - h_deg = _baobab_hierarchy_from_objects( - df_by_object_full, - "degraded", - f"Degraded ({lbl} vs A)", - baobab_ns, - baobab_nf, - ) - pair_both = (not h_imp.empty) and (not h_deg.empty) - plot_entries = [] - for ct, hdf, cmap in ( - ("improved", h_imp, IMPROVED_SCALE), - ("degraded", h_deg, DEGRADED_SCALE), - ): - if hdf.empty: - plot_entries.append((ct, None)) - continue - title = f"{baobab_viz}: {ct} (n = {int(hdf['n'].sum())} GT objects)" - if baobab_viz == "Sunburst": - fig_b = px.sunburst( - hdf, - path=path_cols, - values="n", - color="n", - color_continuous_scale=cmap, - title=title, + df_by_label = pd.DataFrame() + try: + df_by_label = con.execute(query_label).df() + except Exception as e_label: + st.caption(f"Label query: {e_label}") + + scen_agg = pd.DataFrame() + if not df_improved.empty: + scen_agg = ( + df_improved.groupby("scenario_name", dropna=False) + .agg( + improved_cnt=("improved_cnt", "sum"), + degraded_cnt=("degraded_cnt", "sum"), + ) + .reset_index() + ) + scen_agg = scen_agg.sort_values( + by=["degraded_cnt", "improved_cnt"], + ascending=[False, True], + ) + + df_frame_sorted = pd.DataFrame() + if not df_by_frame.empty: + df_frame_sorted = df_by_frame.sort_values( + by=["degraded_cnt", "improved_cnt"], + 
ascending=[False, True], + ).reset_index(drop=True) + + root_lens = f"{lbl} vs A" + lc1, lc2, lc3 = st.columns(3, gap="small") + with lc1: + if not df_by_label.empty: + tdf_l = _comparison_lens_treemap_df( + df_by_label["label"], + df_by_label["improved_cnt"], + df_by_label["degraded_cnt"], + root_lens, + ) + _plot_comparison_lens_treemap( + tdf_l, + f"p5_lens_lab_{lbl}_{idx}", + "By class", ) - h_sb = 480 if pair_both else 620 - apply_chart_theme(fig_b, height=h_sb, margin=dict(t=36, l=4, r=4, b=4)) else: - fig_b = px.treemap( - hdf, - path=path_cols, - values="n", - color="n", - color_continuous_scale=cmap, - title=title, + st.caption("_No label data._") + with lc2: + if not scen_agg.empty: + tdf_s = _comparison_lens_treemap_df( + scen_agg["scenario_name"].astype(str), + scen_agg["improved_cnt"], + scen_agg["degraded_cnt"], + root_lens, ) - h_tr = 440 if pair_both else 520 - apply_chart_theme(fig_b, height=h_tr, margin=dict(t=40, l=4, r=4, b=4)) - plot_entries.append((ct, fig_b)) - - two_up = ( - len(plot_entries) == 2 - and plot_entries[0][1] is not None - and plot_entries[1][1] is not None - ) - if two_up: - bc1, bc2 = st.columns(2, gap="small") - with bc1: - st.plotly_chart( - plot_entries[0][1], + _plot_comparison_lens_treemap( + tdf_s, + f"p5_lens_scen_{lbl}_{idx}", + "By scenario", + ) + else: + st.caption("_No scenario data._") + with lc3: + if not df_frame_sorted.empty: + fr_cap = 36 + fr_top = df_frame_sorted.head(fr_cap).copy() + nms = ( + fr_top["scenario_name"].astype(str).str.slice(0, 26) + + "\n· f" + + fr_top["frame_index"].astype(str) + ).tolist() + ims = fr_top["improved_cnt"].astype(float).tolist() + dgs = fr_top["degraded_cnt"].astype(float).tolist() + rest = df_frame_sorted.iloc[fr_cap:] + if not rest.empty: + io = float(rest["improved_cnt"].sum()) + do = float(rest["degraded_cnt"].sum()) + if io > 0 or do > 0: + nms.append( + f"Other frames\n({len(rest)} frames)" + ) + ims.append(io) + dgs.append(do) + tdf_f = _comparison_lens_treemap_df( + 
pd.Series(nms), + pd.Series(ims), + pd.Series(dgs), + root_lens, + ) + _plot_comparison_lens_treemap( + tdf_f, + f"p5_lens_fr_{lbl}_{idx}", + "By frame", + ) + st.caption( + f"Top **{fr_cap}** frames by degraded, plus **Other frames** " + f"so totals match **By class** / **By scenario**." + ) + else: + st.caption("_No frame data._") + + with st.expander("Tables behind the lens (label / scenario / frame)"): + if not df_by_label.empty: + st.markdown("**Per label**") + st.dataframe( + df_by_label, width='stretch', - key=f"{b_key}_fig_{plot_entries[0][0]}", + hide_index=True, ) - with bc2: - st.plotly_chart( - plot_entries[1][1], + if not scen_agg.empty: + st.markdown("**Per scenario**") + st.dataframe(scen_agg, width='stretch', hide_index=True) + if not df_frame_sorted.empty: + st.markdown("**Per frame** (sorted by degraded)") + st.dataframe( + df_frame_sorted.head(200), width='stretch', - key=f"{b_key}_fig_{plot_entries[1][0]}", + hide_index=True, ) - else: - for ct, fig_b in plot_entries: - if fig_b is not None: - st.plotly_chart( - fig_b, - width='stretch', - key=f"{b_key}_fig_{ct}", - ) - else: - st.caption(f"No **{ct}** objects to chart.") - - # --- Comparison lens: label / scenario / frame (treemap trio, Baobab-aligned) --- - query_label = f""" - WITH base_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_base - FROM view_eval_flat - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_base} - GROUP BY 1, 2, 3 - ), - comp_gt AS ( - SELECT - t4dataset_id, - frame_index, - uuid AS gt_uuid, - COALESCE(MAX(try_cast(label AS VARCHAR)), '') AS label, - COUNT(*) FILTER (WHERE status = 'TP') > 0 AS tp_comp - FROM {comp_flat} - WHERE source = 'GT' AND uuid IS NOT NULL AND frame_index IS NOT NULL - AND {filter_clause_comp_p5} - GROUP BY 1, 2, 3 - ), - joined AS ( - SELECT - COALESCE(b.label, c.label) AS label, - 
COALESCE(b.tp_base, FALSE) AS tp_base, - COALESCE(c.tp_comp, FALSE) AS tp_comp - FROM base_gt b - FULL OUTER JOIN comp_gt c - ON b.t4dataset_id = c.t4dataset_id - AND b.frame_index = c.frame_index - AND b.gt_uuid = c.gt_uuid - ) - SELECT - label, - CAST(COUNT(*) FILTER (WHERE TRUE) AS DOUBLE) AS total_gt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND tp_comp) AS DOUBLE) AS improved_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND NOT tp_comp) AS DOUBLE) AS degraded_cnt, - CAST(COUNT(*) FILTER (WHERE tp_base AND tp_comp) AS DOUBLE) AS both_tp_cnt, - CAST(COUNT(*) FILTER (WHERE NOT tp_base AND NOT tp_comp) AS DOUBLE) AS both_fn_cnt, - CAST(SUM((CASE WHEN tp_comp THEN 1 ELSE 0 END) - (CASE WHEN tp_base THEN 1 ELSE 0 END)) AS DOUBLE) AS net_tp_delta - FROM joined - GROUP BY label - ORDER BY net_tp_delta DESC - """ - df_by_label = pd.DataFrame() - try: - df_by_label = con.execute(query_label).df() - except Exception as e_label: - st.caption(f"Label query: {e_label}") - - scen_agg = pd.DataFrame() - if not df_improved.empty: - scen_agg = ( - df_improved.groupby("scenario_name", dropna=False) - .agg( - improved_cnt=("improved_cnt", "sum"), - degraded_cnt=("degraded_cnt", "sum"), + + with st.expander("Full dataset breakdown (per t4dataset_id row)"): + st.dataframe(df_improved, width='stretch', hide_index=True) + + # --- Drill-down: filters + objects --- + with st.expander("Drill-down: objects"): + scen_key = f"p5_scen_{lbl}_{idx}" + t4_key = f"p5_t4_{lbl}_{idx}" + lab_key = f"p5_lab_{lbl}_{idx}" + for k, default in ((scen_key, []), (t4_key, []), (lab_key, [])): + if k not in st.session_state: + st.session_state[k] = default + + scenarios_all = sorted( + df_improved["scenario_name"].dropna().astype(str).unique().tolist() ) - .reset_index() - ) - scen_agg = scen_agg.sort_values( - by=["degraded_cnt", "improved_cnt"], - ascending=[False, True], - ) - - df_frame_sorted = pd.DataFrame() - if not df_by_frame.empty: - df_frame_sorted = df_by_frame.sort_values( - by=["degraded_cnt", 
"improved_cnt"], - ascending=[False, True], - ).reset_index(drop=True) - - root_lens = f"{lbl} vs A" - lc1, lc2, lc3 = st.columns(3, gap="small") - with lc1: - if not df_by_label.empty: - tdf_l = _comparison_lens_treemap_df( - df_by_label["label"], - df_by_label["improved_cnt"], - df_by_label["degraded_cnt"], - root_lens, + t4_all = sorted( + df_improved["t4dataset_name"].dropna().astype(str).unique().tolist() ) - _plot_comparison_lens_treemap( - tdf_l, - f"p5_lens_lab_{lbl}_{idx}", - "By class", + labels_all = ( + sorted(df_by_object_full["label"].dropna().astype(str).unique().tolist()) + if not df_by_object_full.empty + else [] ) - else: - st.caption("_No label data._") - with lc2: - if not scen_agg.empty: - tdf_s = _comparison_lens_treemap_df( - scen_agg["scenario_name"].astype(str), - scen_agg["improved_cnt"], - scen_agg["degraded_cnt"], - root_lens, + # Keep prior picks valid so Streamlit does not reset widgets when options refresh + scenarios_opts = sorted( + set(scenarios_all) | set(st.session_state.get(scen_key, []) or []) ) - _plot_comparison_lens_treemap( - tdf_s, - f"p5_lens_scen_{lbl}_{idx}", - "By scenario", + t4_opts = sorted(set(t4_all) | set(st.session_state.get(t4_key, []) or [])) + labels_opts = sorted( + set(labels_all) | set(st.session_state.get(lab_key, []) or []) ) - else: - st.caption("_No scenario data._") - with lc3: - if not df_frame_sorted.empty: - fr_cap = 36 - fr_top = df_frame_sorted.head(fr_cap).copy() - nms = ( - fr_top["scenario_name"].astype(str).str.slice(0, 26) - + "\n· f" - + fr_top["frame_index"].astype(str) - ).tolist() - ims = fr_top["improved_cnt"].astype(float).tolist() - dgs = fr_top["degraded_cnt"].astype(float).tolist() - rest = df_frame_sorted.iloc[fr_cap:] - if not rest.empty: - io = float(rest["improved_cnt"].sum()) - do = float(rest["degraded_cnt"].sum()) - if io > 0 or do > 0: - nms.append( - f"Other frames\n({len(rest)} frames)" + + pr1, pr2 = st.columns(2) + with pr1: + if st.button( + "Preset: top 5 degraded 
scenarios", + key=f"p5_pre_scen_{lbl}_{idx}", + ): + if not df_improved.empty: + sa = ( + df_improved.groupby("scenario_name", dropna=False)[ + "degraded_cnt" + ] + .sum() + .sort_values(ascending=False) + .head(5) + ) + st.session_state[scen_key] = [ + str(x) for x in sa.index.tolist() + ] + st.rerun() + fr_multiselect_key = f"p5_frkeys_{lbl}_{idx}" + if fr_multiselect_key not in st.session_state: + st.session_state[fr_multiselect_key] = [] + frame_key_labels = {} + if not df_frame_sorted.empty: + for _, rw in df_frame_sorted.head(40).iterrows(): + fk = f"{rw['t4dataset_id']}|{rw['frame_index']}" + # Use scenario_name (not suite_name) for frame option labels + frame_key_labels[fk] = ( + f"{str(rw.get('scenario_name', ''))[:36]} | " + f"f{rw['frame_index']} | deg {int(rw['degraded_cnt'])}" ) - ims.append(io) - dgs.append(do) - tdf_f = _comparison_lens_treemap_df( - pd.Series(nms), - pd.Series(ims), - pd.Series(dgs), - root_lens, - ) - _plot_comparison_lens_treemap( - tdf_f, - f"p5_lens_fr_{lbl}_{idx}", - "By frame", - ) - st.caption( - f"Top **{fr_cap}** frames by degraded, plus **Other frames** " - f"so totals match **By class** / **By scenario**." 
+ with pr2: + if st.button( + "Preset: top 10 degraded frames (object filter)", + key=f"p5_pre_fr_{lbl}_{idx}", + ): + if frame_key_labels: + topk = list(frame_key_labels.keys())[:10] + st.session_state[fr_multiselect_key] = topk + st.rerun() + + colf1, colf2, colf3 = st.columns(3) + with colf1: + if scenarios_opts: + st.multiselect( + "Filter scenario_name", + scenarios_opts, + key=scen_key, + ) + else: + st.caption("No scenarios.") + with colf2: + if t4_opts: + st.multiselect( + "Filter t4dataset_name", + t4_opts, + key=t4_key, + ) + else: + st.caption("No t4dataset_name.") + with colf3: + if labels_opts: + st.multiselect( + "Filter label", + labels_opts, + key=lab_key, + ) + else: + st.caption("No labels.") + + prev_fr = st.session_state.get(fr_multiselect_key) or [] + base_frame_keys = list(frame_key_labels.keys()) + for k in prev_fr: + if k not in frame_key_labels: + frame_key_labels[k] = f"(selected) frame {str(k).split('|')[-1]}" + frame_opts_keys = base_frame_keys + [ + k for k in prev_fr if k not in base_frame_keys + ] + if frame_opts_keys: + st.multiselect( + "Limit objects to frames (optional)", + options=frame_opts_keys, + format_func=lambda k: frame_key_labels.get(k, k), + key=fr_multiselect_key, + ) + + change_type_filter = st.selectbox( + "Change type", + ["degraded", "improved", "all", "both_tp", "both_fn"], + key=f"change_type_{lbl}_{idx}", + help="Filter objects by TP change between runs.", ) - else: - st.caption("_No frame data._") - - with st.expander("Tables behind the lens (label / scenario / frame)"): - if not df_by_label.empty: - st.markdown("**Per label**") - st.dataframe( - df_by_label, - width='stretch', - hide_index=True, + sort_obj = st.selectbox( + "Sort objects by", + [ + "degraded_priority_then_dist", + "frame_then_uuid", + "label_then_dist", + ], + key=f"p5_sort_{lbl}_{idx}", ) - if not scen_agg.empty: - st.markdown("**Per scenario**") - st.dataframe(scen_agg, width='stretch', hide_index=True) - if not df_frame_sorted.empty: - 
st.markdown("**Per frame** (sorted by degraded)") - st.dataframe( - df_frame_sorted.head(200), - width='stretch', - hide_index=True, + + df_obj_show = ( + df_by_object_full.copy() + if not df_by_object_full.empty + else pd.DataFrame() ) - - with st.expander("Full dataset breakdown (per t4dataset_id row)"): - st.dataframe(df_improved, width='stretch', hide_index=True) - - # --- Drill-down: filters + objects --- - with st.expander("Drill-down: objects"): - scen_key = f"p5_scen_{lbl}_{idx}" - t4_key = f"p5_t4_{lbl}_{idx}" - lab_key = f"p5_lab_{lbl}_{idx}" - for k, default in ((scen_key, []), (t4_key, []), (lab_key, [])): - if k not in st.session_state: - st.session_state[k] = default - - scenarios_all = sorted( - df_improved["scenario_name"].dropna().astype(str).unique().tolist() - ) - t4_all = sorted( - df_improved["t4dataset_name"].dropna().astype(str).unique().tolist() - ) - labels_all = ( - sorted(df_by_object_full["label"].dropna().astype(str).unique().tolist()) - if not df_by_object_full.empty - else [] - ) - # Keep prior picks valid so Streamlit does not reset widgets when options refresh - scenarios_opts = sorted( - set(scenarios_all) | set(st.session_state.get(scen_key, []) or []) - ) - t4_opts = sorted(set(t4_all) | set(st.session_state.get(t4_key, []) or [])) - labels_opts = sorted( - set(labels_all) | set(st.session_state.get(lab_key, []) or []) - ) - - pr1, pr2 = st.columns(2) - with pr1: - if st.button( - "Preset: top 5 degraded scenarios", - key=f"p5_pre_scen_{lbl}_{idx}", - ): - if not df_improved.empty: - sa = ( - df_improved.groupby("scenario_name", dropna=False)[ - "degraded_cnt" - ] - .sum() - .sort_values(ascending=False) - .head(5) - ) - st.session_state[scen_key] = [ - str(x) for x in sa.index.tolist() + if not df_obj_show.empty: + ss = st.session_state.get(scen_key) or [] + if ss: + df_obj_show = df_obj_show[ + df_obj_show["scenario_name"].astype(str).isin(ss) ] - st.rerun() - fr_multiselect_key = f"p5_frkeys_{lbl}_{idx}" - if 
fr_multiselect_key not in st.session_state: - st.session_state[fr_multiselect_key] = [] - frame_key_labels = {} - if not df_frame_sorted.empty: - for _, rw in df_frame_sorted.head(40).iterrows(): - fk = f"{rw['t4dataset_id']}|{rw['frame_index']}" - # Use scenario_name (not suite_name) for frame option labels - frame_key_labels[fk] = ( - f"{str(rw.get('scenario_name', ''))[:36]} | " - f"f{rw['frame_index']} | deg {int(rw['degraded_cnt'])}" - ) - with pr2: - if st.button( - "Preset: top 10 degraded frames (object filter)", - key=f"p5_pre_fr_{lbl}_{idx}", - ): - if frame_key_labels: - topk = list(frame_key_labels.keys())[:10] - st.session_state[fr_multiselect_key] = topk - st.rerun() - - colf1, colf2, colf3 = st.columns(3) - with colf1: - if scenarios_opts: - st.multiselect( - "Filter scenario_name", - scenarios_opts, - key=scen_key, - ) - else: - st.caption("No scenarios.") - with colf2: - if t4_opts: - st.multiselect( - "Filter t4dataset_name", - t4_opts, - key=t4_key, - ) - else: - st.caption("No t4dataset_name.") - with colf3: - if labels_opts: - st.multiselect( - "Filter label", - labels_opts, - key=lab_key, - ) - else: - st.caption("No labels.") - - prev_fr = st.session_state.get(fr_multiselect_key) or [] - base_frame_keys = list(frame_key_labels.keys()) - for k in prev_fr: - if k not in frame_key_labels: - frame_key_labels[k] = f"(selected) frame {str(k).split('|')[-1]}" - frame_opts_keys = base_frame_keys + [ - k for k in prev_fr if k not in base_frame_keys - ] - if frame_opts_keys: - st.multiselect( - "Limit objects to frames (optional)", - options=frame_opts_keys, - format_func=lambda k: frame_key_labels.get(k, k), - key=fr_multiselect_key, + tt = st.session_state.get(t4_key) or [] + if tt: + df_obj_show = df_obj_show[ + df_obj_show["t4dataset_name"].astype(str).isin(tt) + ] + ll = st.session_state.get(lab_key) or [] + if ll: + df_obj_show = df_obj_show[ + df_obj_show["label"].astype(str).isin(ll) + ] + fk_sel = st.session_state.get(fr_multiselect_key) or [] 
+ if fk_sel: + fk_set = set(fk_sel) + df_obj_show = df_obj_show[ + ( + df_obj_show["t4dataset_id"].astype(str) + + "|" + + df_obj_show["frame_index"].astype(str) + ).isin(fk_set) + ] + if change_type_filter != "all": + df_obj_show = df_obj_show[ + df_obj_show["change_type"] == change_type_filter + ] + if sort_obj == "degraded_priority_then_dist": + df_obj_show = df_obj_show.copy() + df_obj_show["_prio"] = df_obj_show["change_type"].map( + { + "degraded": 0, + "improved": 1, + "both_tp": 2, + "both_fn": 3, + } + ) + df_obj_show = df_obj_show.sort_values( + by=["_prio", "dist_h"], + ascending=[True, True], + ).drop(columns=["_prio"], errors="ignore") + elif sort_obj == "frame_then_uuid": + df_obj_show = df_obj_show.sort_values( + by=["t4dataset_id", "frame_index", "gt_uuid"] + ) + else: + df_obj_show = df_obj_show.sort_values( + by=["label", "dist_h", "t4dataset_id", "frame_index"] + ) + + n_show = 200 + st.caption( + f"Showing up to {n_show} rows; use **Download CSV** for the full filtered list." 
) - - change_type_filter = st.selectbox( - "Change type", - ["degraded", "improved", "all", "both_tp", "both_fn"], - key=f"change_type_{lbl}_{idx}", - help="Filter objects by TP change between runs.", - ) - sort_obj = st.selectbox( - "Sort objects by", - [ - "degraded_priority_then_dist", - "frame_then_uuid", - "label_then_dist", - ], - key=f"p5_sort_{lbl}_{idx}", - ) - - df_obj_show = ( - df_by_object_full.copy() - if not df_by_object_full.empty - else pd.DataFrame() - ) - if not df_obj_show.empty: - ss = st.session_state.get(scen_key) or [] - if ss: - df_obj_show = df_obj_show[ - df_obj_show["scenario_name"].astype(str).isin(ss) - ] - tt = st.session_state.get(t4_key) or [] - if tt: - df_obj_show = df_obj_show[ - df_obj_show["t4dataset_name"].astype(str).isin(tt) - ] - ll = st.session_state.get(lab_key) or [] - if ll: - df_obj_show = df_obj_show[ - df_obj_show["label"].astype(str).isin(ll) - ] - fk_sel = st.session_state.get(fr_multiselect_key) or [] - if fk_sel: - fk_set = set(fk_sel) - df_obj_show = df_obj_show[ - ( - df_obj_show["t4dataset_id"].astype(str) - + "|" - + df_obj_show["frame_index"].astype(str) - ).isin(fk_set) - ] - if change_type_filter != "all": - df_obj_show = df_obj_show[ - df_obj_show["change_type"] == change_type_filter - ] - if sort_obj == "degraded_priority_then_dist": - df_obj_show = df_obj_show.copy() - df_obj_show["_prio"] = df_obj_show["change_type"].map( - { - "degraded": 0, - "improved": 1, - "both_tp": 2, - "both_fn": 3, - } + if not df_obj_show.empty: + st.download_button( + label="Download filtered objects (CSV)", + data=df_obj_show.to_csv(index=False).encode("utf-8"), + file_name=f"perception_diff_{lbl}_vs_A_objects.csv", + mime="text/csv", + key=f"p5_dl_{lbl}_{idx}", ) - df_obj_show = df_obj_show.sort_values( - by=["_prio", "dist_h"], - ascending=[True, True], - ).drop(columns=["_prio"], errors="ignore") - elif sort_obj == "frame_then_uuid": - df_obj_show = df_obj_show.sort_values( - by=["t4dataset_id", "frame_index", "gt_uuid"] 
+ st.dataframe( + df_obj_show.head(n_show), + width='stretch', + hide_index=True, ) else: - df_obj_show = df_obj_show.sort_values( - by=["label", "dist_h", "t4dataset_id", "frame_index"] - ) - - n_show = 200 - st.caption( - f"Showing up to {n_show} rows; use **Download CSV** for the full filtered list." - ) - if not df_obj_show.empty: - st.download_button( - label="Download filtered objects (CSV)", - data=df_obj_show.to_csv(index=False).encode("utf-8"), - file_name=f"perception_diff_{lbl}_vs_A_objects.csv", - mime="text/csv", - key=f"p5_dl_{lbl}_{idx}", - ) - st.dataframe( - df_obj_show.head(n_show), - width='stretch', - hide_index=True, - ) - else: - st.caption("No objects match filters.") - - with st.expander("Full frame table (sort: degraded desc)"): - if not df_frame_sorted.empty: - st.dataframe(df_frame_sorted, width='stretch', hide_index=True) - else: - st.caption("No frame breakdown.") - else: - st.caption(f"Run {lbl} vs A: No data.") - except Exception as e: - st.error(f"Error (Run {lbl} vs A): {e}") - finally: - _pd_slot.empty() - -# ============================= -# Single mode: Frame / Object level — Where are the misses? 
-# ============================= -if single_mode: - st.markdown(section_header_html("Frame / Object level: Where are the misses?"), unsafe_allow_html=True) - _fn_slot = st.empty() - _fn_slot.markdown(ds_spot_loading_markup("FN by frame & object"), unsafe_allow_html=True) - try: - with st.expander("FN by frame and by object", expanded=True): - query_fn_frame = f""" - SELECT - t4dataset_id, - frame_index, - COALESCE(MAX(CAST(scenario_name AS VARCHAR)), '') AS scenario_name, - COALESCE(MAX(CAST(suite_name AS VARCHAR)), '') AS suite_name, - COALESCE(MAX(CAST(t4dataset_name AS VARCHAR)), '') AS t4dataset_name, - COUNT(*) AS fn_cnt - FROM view_eval_flat - WHERE source = 'GT' AND status = 'FN' AND {filter_clause_base} - GROUP BY t4dataset_id, frame_index - ORDER BY fn_cnt DESC - """ - df_fn_frame = con.execute(query_fn_frame).df() - query_fn_object = f""" - SELECT - t4dataset_id, - frame_index, - uuid, - COALESCE(CAST(label AS VARCHAR), '') AS label, - dist_h, - COALESCE(CAST(scenario_name AS VARCHAR), '') AS scenario_name, - COALESCE(CAST(suite_name AS VARCHAR), '') AS suite_name - FROM view_eval_flat - WHERE source = 'GT' AND status = 'FN' AND {filter_clause_base} - ORDER BY t4dataset_id, frame_index, uuid - """ - df_fn_object = con.execute(query_fn_object).df() - if not df_fn_frame.empty: - st.markdown("**FN count by frame**") - st.download_button("Download FN by frame (CSV)", data=df_fn_frame.to_csv(index=False).encode("utf-8"), file_name="fn_by_frame.csv", mime="text/csv", key="dl_fn_frame") - st.dataframe(df_fn_frame, width='stretch', hide_index=True) - else: - st.caption("No FN by frame.") - if not df_fn_object.empty: - st.markdown("**FN objects**") - if len(df_fn_object) > 500: - st.caption(f"Showing first 500 of {len(df_fn_object)} FN objects.") - st.dataframe(df_fn_object.head(500), width='stretch', hide_index=True) + st.caption("No objects match filters.") + + with st.expander("Full frame table (sort: degraded desc)"): + if not df_frame_sorted.empty: + 
st.dataframe(df_frame_sorted, width='stretch', hide_index=True) + else: + st.caption("No frame breakdown.") else: - st.dataframe(df_fn_object, width='stretch', hide_index=True) - else: - st.caption("No FN objects.") - except Exception as e: - st.error(f"Error in FN by frame/object: {e}") - finally: - _fn_slot.empty() - -# ============================= -# Panel 6: Mean Error (single) / Mean Error Comparison (compare) -# ============================= -st.divider() -st.markdown( - section_header_html( - "Mean Error" + (" Comparison" if not single_mode else ""), - "Mean absolute error on TP matches (X/Y in m, Yaw in rad)." - + (" Compare mode: choose grouped bars or spider charts." if not single_mode else ""), - ), - unsafe_allow_html=True, -) - -try: - sample_query = "SELECT * FROM view_eval_flat LIMIT 1" - sample_df = con.execute(sample_query).df() - has_error_cols = all(col in sample_df.columns for col in ['x_error', 'y_error', 'yaw_error']) -except Exception: - has_error_cols = False - -if not has_error_cols: - st.info("Error columns (x_error, y_error, yaw_error) not found in data. Skipping error analysis.") -else: + st.caption(f"Run {lbl} vs A: No data.") + except Exception as e: + st.error(f"Error (Run {lbl} vs A): {e}") + finally: + _pd_slot.empty() + + # ============================= + # Single mode: Frame / Object level — Where are the misses? 
+ # ============================= if single_mode: + ds_dlog("section: Frame_FN_misses_start") + st.markdown(section_header_html("Frame / Object level: Where are the misses?"), unsafe_allow_html=True) + _fn_slot = st.empty() + _fn_slot.markdown(ds_spot_loading_markup("FN by frame & object"), unsafe_allow_html=True) try: - with ds_spot_loading("Mean error"): - query = f""" + with st.expander("FN by frame and by object", expanded=True): + query_fn_frame = f""" SELECT - label, - AVG(ABS(CAST(x_error AS DOUBLE))) FILTER ( - WHERE status = 'TP' AND x_error IS NOT NULL - ) AS mean_abs_x_error, - AVG(ABS(CAST(y_error AS DOUBLE))) FILTER ( - WHERE status = 'TP' AND y_error IS NOT NULL - ) AS mean_abs_y_error, - AVG(ABS(CAST(yaw_error AS DOUBLE))) FILTER ( - WHERE status = 'TP' AND yaw_error IS NOT NULL - ) AS mean_abs_yaw_error + t4dataset_id, + frame_index, + COALESCE(MAX(CAST(scenario_name AS VARCHAR)), '') AS scenario_name, + COALESCE(MAX(CAST(suite_name AS VARCHAR)), '') AS suite_name, + COALESCE(MAX(CAST(t4dataset_name AS VARCHAR)), '') AS t4dataset_name, + COUNT(*) AS fn_cnt FROM view_eval_flat - WHERE {filter_clause_base} - GROUP BY label - ORDER BY label + WHERE source = 'GT' AND status = 'FN' AND {filter_clause_base} + GROUP BY t4dataset_id, frame_index + ORDER BY fn_cnt DESC """ - df_error_base = con.execute(query).df() - if not df_error_base.empty: - fig = go.Figure() - fig.add_trace(go.Bar( - x=df_error_base['label'], - y=df_error_base['mean_abs_x_error'], - name='X Error', - marker_color=RUN_COLORS[0], - )) - fig.add_trace(go.Bar( - x=df_error_base['label'], - y=df_error_base['mean_abs_y_error'], - name='Y Error', - marker_color=RUN_COLORS[1], - )) - fig.add_trace(go.Bar( - x=df_error_base['label'], - y=df_error_base['mean_abs_yaw_error'], - name='Yaw Error', - marker_color=RUN_COLORS[2], - )) - apply_chart_theme(fig) - fig.update_layout( - title=f"Mean Error within {max_eval_range} [m]", - xaxis_title="Label", - yaxis_title="Error [m] or [rad]", - 
barmode='group' - ) - st.plotly_chart(fig, width="stretch") - else: - st.info("No data available") + df_fn_frame = con.execute(query_fn_frame).df() + query_fn_object = f""" + SELECT + t4dataset_id, + frame_index, + uuid, + COALESCE(CAST(label AS VARCHAR), '') AS label, + dist_h, + COALESCE(CAST(scenario_name AS VARCHAR), '') AS scenario_name, + COALESCE(CAST(suite_name AS VARCHAR), '') AS suite_name + FROM view_eval_flat + WHERE source = 'GT' AND status = 'FN' AND {filter_clause_base} + ORDER BY t4dataset_id, frame_index, uuid + """ + df_fn_object = con.execute(query_fn_object).df() + if not df_fn_frame.empty: + st.markdown("**FN count by frame**") + st.download_button("Download FN by frame (CSV)", data=df_fn_frame.to_csv(index=False).encode("utf-8"), file_name="fn_by_frame.csv", mime="text/csv", key="dl_fn_frame") + st.dataframe(df_fn_frame, width='stretch', hide_index=True) + else: + st.caption("No FN by frame.") + if not df_fn_object.empty: + st.markdown("**FN objects**") + if len(df_fn_object) > 500: + st.caption(f"Showing first 500 of {len(df_fn_object)} FN objects.") + st.dataframe(df_fn_object.head(500), width='stretch', hide_index=True) + else: + st.dataframe(df_fn_object, width='stretch', hide_index=True) + else: + st.caption("No FN objects.") except Exception as e: - st.error(f"Error: {e}") + st.error(f"Error in FN by frame/object: {e}") + finally: + _fn_slot.empty() + + # ============================= + # Panel 6: Mean Error (single) / Mean Error Comparison (compare) + # ============================= + ds_dlog("section: Panel6_Mean_Error_start") + st.divider() + st.markdown( + section_header_html( + "Mean Error" + (" Comparison" if not single_mode else ""), + "Mean absolute error on TP matches (X/Y in m, Yaw in rad)." + + (" Compare mode: choose grouped bars or spider charts." 
if not single_mode else ""), + ), + unsafe_allow_html=True, + ) + + try: + sample_query = "SELECT * FROM view_eval_flat LIMIT 1" + sample_df = con.execute(sample_query).df() + has_error_cols = all(col in sample_df.columns for col in ['x_error', 'y_error', 'yaw_error']) + except Exception: + has_error_cols = False + + if not has_error_cols: + st.info("Error columns (x_error, y_error, yaw_error) not found in data. Skipping error analysis.") else: - try: - with ds_spot_loading("Mean error"): - dfs_err = [] - for i in range(len(runs)): - fc = build_filter_clause(filters_list[i]) - q = f""" + if single_mode: + try: + with ds_spot_loading("Mean error"): + query = f""" SELECT label, - AVG(ABS(CAST(x_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND x_error IS NOT NULL) AS mean_abs_x_error, - AVG(ABS(CAST(y_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND y_error IS NOT NULL) AS mean_abs_y_error, - AVG(ABS(CAST(yaw_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND yaw_error IS NOT NULL) AS mean_abs_yaw_error - FROM {_flat_view(i)} - WHERE {fc} + AVG(ABS(CAST(x_error AS DOUBLE))) FILTER ( + WHERE status = 'TP' AND x_error IS NOT NULL + ) AS mean_abs_x_error, + AVG(ABS(CAST(y_error AS DOUBLE))) FILTER ( + WHERE status = 'TP' AND y_error IS NOT NULL + ) AS mean_abs_y_error, + AVG(ABS(CAST(yaw_error AS DOUBLE))) FILTER ( + WHERE status = 'TP' AND yaw_error IS NOT NULL + ) AS mean_abs_yaw_error + FROM view_eval_flat + WHERE {filter_clause_base} GROUP BY label ORDER BY label """ - df_i = con.execute(q).df() - df_i["run"] = run_labels_list[i] - dfs_err.append(df_i) - df_err_melt = pd.concat(dfs_err, ignore_index=True) - if not df_err_melt.empty: - mean_err_viz = st.radio( - "Mean error chart style", - options=["Spider chart (X, Y & Yaw)", "Grouped bar"], - index=0, - horizontal=True, - key="mean_err_compare_viz", - ) - if mean_err_viz == "Grouped bar": - for err_type, col in [ - ("X Error", "mean_abs_x_error"), - ("Y Error", "mean_abs_y_error"), - ("Yaw Error", 
"mean_abs_yaw_error"), - ]: - fig = px.bar( - df_err_melt, - x="label", - y=col, - color="run", - barmode="group", - title=f"Mean {err_type} within {max_eval_range} [m] by run", - labels={"label": "Label", col: err_type, "run": "Run"}, - color_discrete_sequence=RUN_COLORS, - ) - apply_chart_theme(fig) - st.plotly_chart(fig, width="stretch") + df_error_base = con.execute(query).df() + if not df_error_base.empty: + fig = go.Figure() + fig.add_trace(go.Bar( + x=df_error_base['label'], + y=df_error_base['mean_abs_x_error'], + name='X Error', + marker_color=RUN_COLORS[0], + )) + fig.add_trace(go.Bar( + x=df_error_base['label'], + y=df_error_base['mean_abs_y_error'], + name='Y Error', + marker_color=RUN_COLORS[1], + )) + fig.add_trace(go.Bar( + x=df_error_base['label'], + y=df_error_base['mean_abs_yaw_error'], + name='Yaw Error', + marker_color=RUN_COLORS[2], + )) + apply_chart_theme(fig) + fig.update_layout( + title=f"Mean Error within {max_eval_range} [m]", + xaxis_title="Label", + yaxis_title="Error [m] or [rad]", + barmode='group' + ) + st.plotly_chart(fig, width="stretch") else: - st.caption( - f"Three spiders: mean |error| per label per run (TP only), within **{max_eval_range} m** " - "(same as sidebar max range)." 
+ st.info("No data available") + except Exception as e: + st.error(f"Error: {e}") + else: + try: + with ds_spot_loading("Mean error"): + dfs_err = [] + for i in range(len(runs)): + fc = build_filter_clause(filters_list[i]) + q = f""" + SELECT + label, + AVG(ABS(CAST(x_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND x_error IS NOT NULL) AS mean_abs_x_error, + AVG(ABS(CAST(y_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND y_error IS NOT NULL) AS mean_abs_y_error, + AVG(ABS(CAST(yaw_error AS DOUBLE))) FILTER (WHERE status = 'TP' AND yaw_error IS NOT NULL) AS mean_abs_yaw_error + FROM {_flat_view(i)} + WHERE {fc} + GROUP BY label + ORDER BY label + """ + df_i = con.execute(q).df() + df_i["run"] = run_labels_list[i] + dfs_err.append(df_i) + df_err_melt = pd.concat(dfs_err, ignore_index=True) + if not df_err_melt.empty: + mean_err_viz = st.radio( + "Mean error chart style", + options=["Spider chart (X, Y & Yaw)", "Grouped bar"], + index=0, + horizontal=True, + key="mean_err_compare_viz", ) - cats = sorted(df_err_melt["label"].astype(str).unique()) - if len(cats) > 16: - st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") - rcols = st.columns(3) - err_specs = [ - ( - f"Mean |x error| (within {max_eval_range} m)", - "mean_abs_x_error", - "Mean |x error| (m)", - ".3f", - ), - ( - f"Mean |y error| (within {max_eval_range} m)", - "mean_abs_y_error", - "Mean |y error| (m)", - ".3f", - ), - ( - f"Mean |yaw error| (within {max_eval_range} m)", - "mean_abs_yaw_error", - "Mean |yaw error| (rad)", - ".4f", - ), - ] - for ci, (chart_title, col, hover_lbl, tfmt) in enumerate(err_specs): - fig_r = _scalar_metric_spider_compare( - df_err_melt, - cats, - chart_title, - run_labels_list, - col, - hover_lbl, - height=400, - tickformat=tfmt, + if mean_err_viz == "Grouped bar": + for err_type, col in [ + ("X Error", "mean_abs_x_error"), + ("Y Error", "mean_abs_y_error"), + ("Yaw Error", "mean_abs_yaw_error"), + ]: + fig = px.bar( + df_err_melt, + 
x="label", + y=col, + color="run", + barmode="group", + title=f"Mean {err_type} within {max_eval_range} [m] by run", + labels={"label": "Label", col: err_type, "run": "Run"}, + color_discrete_sequence=RUN_COLORS, + ) + apply_chart_theme(fig) + st.plotly_chart(fig, width="stretch") + else: + st.caption( + f"Three spiders: mean |error| per label per run (TP only), within **{max_eval_range} m** " + "(same as sidebar max range)." ) - with rcols[ci]: - st.plotly_chart(fig_r, width='stretch') - else: - st.info("No data available") - except Exception as e: - st.error(f"Error: {e}") - - st.markdown(section_header_html("Difference of mean absolute error (each run − Baseline A)"), unsafe_allow_html=True) - for idx in range(1, len(runs)): - lbl = run_labels_list[idx] - _med_slot = st.empty() - _med_slot.markdown(ds_spot_loading_markup(f"Mean error diff · run {lbl}"), unsafe_allow_html=True) - try: - fc_c = build_filter_clause(filters_list[idx]) - query = f""" - WITH topic_a AS ( - SELECT label, - AVG(ABS(x_error)) FILTER (WHERE status = 'TP') AS x_a, - AVG(ABS(y_error)) FILTER (WHERE status = 'TP') AS y_a, - AVG(ABS(yaw_error)) FILTER (WHERE status = 'TP') AS yaw_a - FROM view_eval_flat - WHERE {filter_clause_base} - GROUP BY label - ), - topic_c AS ( - SELECT label, - AVG(ABS(x_error)) FILTER (WHERE status = 'TP') AS x_c, - AVG(ABS(y_error)) FILTER (WHERE status = 'TP') AS y_c, - AVG(ABS(yaw_error)) FILTER (WHERE status = 'TP') AS yaw_c - FROM {_flat_view(idx)} - WHERE {fc_c} - GROUP BY label - ) - SELECT a.label, - (c.x_c - a.x_a) AS x_diff, - (c.y_c - a.y_a) AS y_diff, - (c.yaw_c - a.yaw_a) AS yaw_diff - FROM topic_a a - JOIN topic_c c USING (label) - ORDER BY label - """ - df_ed = con.execute(query).df() - if not df_ed.empty: - with st.expander(f"Run {lbl} − A", expanded=(len(runs) == 2)): - fig = go.Figure() - fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["x_diff"], name="X Diff", marker_color=RUN_COLORS[0])) - fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["y_diff"], 
name="Y Diff", marker_color=RUN_COLORS[1])) - fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["yaw_diff"], name="Yaw Diff", marker_color=RUN_COLORS[2])) - apply_chart_theme(fig) - fig.update_layout(title=f"Error diff ({lbl} − A) within {max_eval_range} [m]", xaxis_title="Label", yaxis_title="Error Difference [m] or [rad]", barmode="group") - st.plotly_chart(fig, width="stretch") + cats = sorted(df_err_melt["label"].astype(str).unique()) + if len(cats) > 16: + st.caption("Spider charts work best with ≤16 labels; many classes may look crowded.") + rcols = st.columns(3) + err_specs = [ + ( + f"Mean |x error| (within {max_eval_range} m)", + "mean_abs_x_error", + "Mean |x error| (m)", + ".3f", + ), + ( + f"Mean |y error| (within {max_eval_range} m)", + "mean_abs_y_error", + "Mean |y error| (m)", + ".3f", + ), + ( + f"Mean |yaw error| (within {max_eval_range} m)", + "mean_abs_yaw_error", + "Mean |yaw error| (rad)", + ".4f", + ), + ] + for ci, (chart_title, col, hover_lbl, tfmt) in enumerate(err_specs): + fig_r = _scalar_metric_spider_compare( + df_err_melt, + cats, + chart_title, + run_labels_list, + col, + hover_lbl, + height=400, + tickformat=tfmt, + ) + with rcols[ci]: + st.plotly_chart(fig_r, width='stretch') + else: + st.info("No data available") except Exception as e: - st.error(f"Error (Run {lbl} − A): {e}") - finally: - _med_slot.empty() + st.error(f"Error: {e}") + + st.markdown(section_header_html("Difference of mean absolute error (each run − Baseline A)"), unsafe_allow_html=True) + for idx in range(1, len(runs)): + lbl = run_labels_list[idx] + _med_slot = st.empty() + _med_slot.markdown(ds_spot_loading_markup(f"Mean error diff · run {lbl}"), unsafe_allow_html=True) + try: + fc_c = build_filter_clause(filters_list[idx]) + query = f""" + WITH topic_a AS ( + SELECT label, + AVG(ABS(x_error)) FILTER (WHERE status = 'TP') AS x_a, + AVG(ABS(y_error)) FILTER (WHERE status = 'TP') AS y_a, + AVG(ABS(yaw_error)) FILTER (WHERE status = 'TP') AS yaw_a + FROM view_eval_flat 
+ WHERE {filter_clause_base} + GROUP BY label + ), + topic_c AS ( + SELECT label, + AVG(ABS(x_error)) FILTER (WHERE status = 'TP') AS x_c, + AVG(ABS(y_error)) FILTER (WHERE status = 'TP') AS y_c, + AVG(ABS(yaw_error)) FILTER (WHERE status = 'TP') AS yaw_c + FROM {_flat_view(idx)} + WHERE {fc_c} + GROUP BY label + ) + SELECT a.label, + (c.x_c - a.x_a) AS x_diff, + (c.y_c - a.y_a) AS y_diff, + (c.yaw_c - a.yaw_a) AS yaw_diff + FROM topic_a a + JOIN topic_c c USING (label) + ORDER BY label + """ + df_ed = con.execute(query).df() + if not df_ed.empty: + with st.expander(f"Run {lbl} − A", expanded=(len(runs) == 2)): + fig = go.Figure() + fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["x_diff"], name="X Diff", marker_color=RUN_COLORS[0])) + fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["y_diff"], name="Y Diff", marker_color=RUN_COLORS[1])) + fig.add_trace(go.Bar(x=df_ed["label"], y=df_ed["yaw_diff"], name="Yaw Diff", marker_color=RUN_COLORS[2])) + apply_chart_theme(fig) + fig.update_layout(title=f"Error diff ({lbl} − A) within {max_eval_range} [m]", xaxis_title="Label", yaxis_title="Error Difference [m] or [rad]", barmode="group") + st.plotly_chart(fig, width="stretch") + except Exception as e: + st.error(f"Error (Run {lbl} − A): {e}") + finally: + _med_slot.empty() + + ds_dlog("main_content_try_exit_ok") + ds_debug_log_memory("main_content_end") + +except Exception as _e_ds_main: + ds_debug_log_exception("detection_stats_main_try", _e_ds_main) + raise -_ds_loading_banner.empty() +finally: + try: + ds_debug_render_expander(st.session_state) + except Exception as _e_dbg_exp: + ds_debug_log_exception("ds_debug_render_expander", _e_dbg_exp) + ds_dlog("main_content_finally_banner_clear") + _ds_loading_banner.empty() + ds_dlog("detection_stats_script_run_complete") diff --git a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py index 8df5009..1d36567 100644 --- 
a/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py +++ b/evaluation_dashboard_app/pages/4_Bounding_Box_Viewer.py @@ -1,5 +1,8 @@ +import html import duckdb +import requests import streamlit as st +import streamlit.components.v1 as components import plotly.graph_objects as go import plotly.express as px import numpy as np @@ -12,6 +15,17 @@ from lib.parquet_schema import schema_flags from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero from lib.ui.bounding_box_viewer_ui import bev_overlay_line_and_status_legend_markup, bev_status_legend_markup +from lib.t4_dataset_embed import t4_share_query_params +from lib.t4_three_layers import resolve_t4_dataset_id, resolve_t4_scenario +from lib.t4_visualizer_client import ( + DEFAULT_BASE_URL, + ENV_BASE_URL, + RenderRequest, + TargetObjectIn, + T4VisualizerClient, + T4VisualizerError, + target_object_from_gt_row, +) st.set_page_config( layout="wide", @@ -311,6 +325,36 @@ def list_parquets_in_run(run_path) -> List[str]: ) compare_view_mode = "overlay" if "Overlay" in compare_view_mode else "side_by_side" +# --- T4 visualizer (base URL + preview mode in sidebar) +with st.sidebar: + st.markdown("##### T4 visualizer") + st.caption("Uses **GET /datasets/{id}/availability** first; preview runs only if the server reports the dataset is available.") + if "bbox_t4_base_url" not in st.session_state: + st.session_state["bbox_t4_base_url"] = ( + (os.environ.get(ENV_BASE_URL) or DEFAULT_BASE_URL).strip() or DEFAULT_BASE_URL + ) + st.text_input( + "T4 server base URL", + key="bbox_t4_base_url", + help=f"Default from env `{ENV_BASE_URL}`; needs **GET /render/html** (iframe) and **POST /render** (PNG mode).", + ) + _t4_mode = st.radio( + "T4 preview", + ["html_iframe", "post_png"], + format_func=lambda m: ( + "HTML iframe (/render/html)" if m == "html_iframe" else "POST /render (PNGs here)" + ), + key="bbox_t4_preview_mode", + horizontal=True, + ) + if _t4_mode == "post_png": + _t4p1, _t4p2 = 
st.columns(2) + with _t4p1: + st.checkbox("Crop cameras", value=True, key="bbox_t4_crop_cameras") + st.checkbox("Show dataset annotations", value=True, key="bbox_t4_show_ann") + with _t4p2: + st.checkbox("Draw GT rows as target boxes", value=True, key="bbox_t4_overlay_gt") + # ---------------------------- # Build query safely & load data @@ -329,9 +373,14 @@ def list_parquets_in_run(run_path) -> List[str]: params.extend(selected_visibility) select_extras = (", " + ", ".join(hover_extra_cols)) if hover_extra_cols else "" +# Optional columns for T4 server overlay (z/height) and resolving dataset / scenario per row +_geom_for_t4 = [c for c in ("z", "height") if c in cols and c not in hover_extra_cols] +_geom_select = (", " + ", ".join(_geom_for_t4)) if _geom_for_t4 else "" +_t4_meta_cols = [c for c in ("t4dataset_id", "t4dataset_name", "scenario_name") if c in cols] +_t4_meta_select = (", " + ", ".join(_t4_meta_cols)) if _t4_meta_cols else "" sql = f""" SELECT frame_index, x, y, length, width, yaw, label, topic_name, source, status, uuid -{select_vis}{select_extras} +{select_vis}{select_extras}{_geom_select}{_t4_meta_select} FROM parquet_scan(?) 
WHERE {" AND ".join(where)} ORDER BY frame_index @@ -447,6 +496,257 @@ def get_color(source, status): return color_map.get((source, status), "#999999") with k4: st.metric("TP (EST)", tp_est_count) with k5: st.metric("TPR", f"{tpr_frame:.2%}" if tpr_frame is not None else "—") +# ---------------------------- +# T4 visualizer (HTTP server): camera PNGs for current frame +# ---------------------------- +def _bbox_t4_request_key( + ds: str, + sc: str, + frame_idx: int, + base_url: str, + crop: bool, + show_ann: bool, + overlay_gt: bool, +) -> Tuple[Any, ...]: + return ( + str(ds), + str(sc), + int(frame_idx), + str(base_url).rstrip("/"), + bool(crop), + bool(show_ann), + bool(overlay_gt), + ) + + +_t4_preview_mode = st.session_state.get("bbox_t4_preview_mode", "html_iframe") + +base_url_t4 = (st.session_state.get("bbox_t4_base_url") or "").strip() or DEFAULT_BASE_URL + +_ds_t4 = resolve_t4_dataset_id(df_frame) +if not _ds_t4 and selected_t4dataset is not None: + _ds_t4 = str(selected_t4dataset) +_sc_t4 = resolve_t4_scenario(df_frame, selected_scenario) + +if not _ds_t4: + for _k in ( + "bbox_t4_last_images", + "bbox_t4_last_meta", + "bbox_t4_success_key", + "bbox_t4_error_key", + "bbox_t4_error_msg", + "bbox_t4_availability", + ): + st.session_state.pop(_k, None) + st.caption("T4 camera preview is not available for this scene.") + with st.expander("Details", expanded=False): + st.markdown( + "Needs parquet **t4dataset_id** or **t4dataset_name** (or **t4dataset_name** in the sidebar when " + "multiple datasets exist). " + "The Tier4 HTTP visualizer (`t4-server`) must serve that dataset. " + f"Set **T4 server base URL** in the sidebar or `{ENV_BASE_URL}`." 
+ ) +else: + _t4_avail_cache_key = f"{base_url_t4.rstrip('/')}|{_ds_t4}" + _cached_av = st.session_state.get("bbox_t4_availability") + _need_avail_fetch = _cached_av is None or _cached_av.get("cache_key") != _t4_avail_cache_key + if _need_avail_fetch: + try: + with st.spinner("Checking T4 dataset on the server…"): + _av_client = T4VisualizerClient(base_url=base_url_t4, timeout=30.0) + _av_data = _av_client.dataset_availability(_ds_t4) + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": True, + "available": bool(_av_data.get("available")), + "data": _av_data, + "error": None, + } + except T4VisualizerError as ex: + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"T4 server error ({ex.status_code}): {ex}", + } + except (OSError, requests.RequestException) as ex: + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"Network error: {ex}", + } + except Exception as ex: + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"Availability check failed: {ex}", + } + + _av = st.session_state.get("bbox_t4_availability") or {} + + if not _av.get("ok"): + st.caption("T4 preview skipped — could not verify dataset on the visualizer server.") + with st.expander("Details", expanded=False): + st.markdown(_av.get("error") or "Unknown error.") + elif not _av.get("available"): + st.caption("T4 preview skipped — this dataset is not on the visualizer server host.") + with st.expander("Details", expanded=False): + _d = _av.get("data") + if isinstance(_d, dict) and _d: + st.json(_d) + else: + st.markdown( + "The server reported **available: false** (no local dataset path for this id on the machine " + "running `t4-server`)." 
+ ) + else: + _q_three = t4_share_query_params(_ds_t4, _sc_t4, int(frame)) + _viewer_three_url = f"{base_url_t4.rstrip('/')}/viewer/three?{_q_three}" + st.caption("**3D viewer** (Three.js, GT / pred / matched layers) lives on a dedicated page.") + c3d_a, c3d_b = st.columns([1, 2]) + with c3d_a: + st.page_link("pages/5_T4_3D_Viewer.py", label="Open T4 3D Viewer", icon="🧊") + with c3d_b: + st.markdown(f"[Open `/viewer/three` in new tab]({_viewer_three_url})") + + if not _av.get("ok") or not _av.get("available"): + pass + elif _t4_preview_mode == "html_iframe": + _q = t4_share_query_params(_ds_t4, _sc_t4, int(frame)) + _render_html_url = f"{base_url_t4.rstrip('/')}/render/html?{_q}" + st.markdown(f"[Open in new tab]({_render_html_url})") + _iframe_h = 900 + # Iframe shell: neutral gray while the document loads (avoid #141418 — reads as a black box for ~2s until + # the large /render/html response paints; inner page still sets its own dark background). + components.html( + f'', + height=_iframe_h + 24, + scrolling=True, + ) + elif not _sc_t4: + st.caption("POST /render mode needs **scenario_name** (sidebar or parquet) for this scene.") + with st.expander("Details", expanded=False): + st.markdown( + "Pick a **Scenario name** in the sidebar or ensure parquet includes **scenario_name**. " + "Alternatively switch to **HTML iframe** mode if the server accepts an empty scenario for your dataset." 
+ ) + else: + t4_crop = bool(st.session_state.get("bbox_t4_crop_cameras", True)) + t4_show_ann = bool(st.session_state.get("bbox_t4_show_ann", True)) + t4_overlay_gt = bool(st.session_state.get("bbox_t4_overlay_gt", True)) + + _req_key = _bbox_t4_request_key( + _ds_t4, + _sc_t4, + int(frame), + base_url_t4, + t4_crop, + t4_show_ann, + t4_overlay_gt, + ) + _ok_key = st.session_state.get("bbox_t4_success_key") + _bad_key = st.session_state.get("bbox_t4_error_key") + + _should_fetch = _req_key != _ok_key and _req_key != _bad_key + + if _should_fetch: + try: + with st.spinner("Loading T4 camera renders… (usually ~2 seconds)"): + client = T4VisualizerClient( + base_url=base_url_t4, + timeout=120.0, + ) + targets = [] + if t4_overlay_gt: + for _, row in df_frame[df_frame["source"] == "GT"].iterrows(): + d = target_object_from_gt_row(row.to_dict()) + targets.append(TargetObjectIn(**d)) + req = RenderRequest( + t4dataset_id=_ds_t4, + scenario_name=_sc_t4, + frame_index=int(frame), + target_objects=targets, + crop_cameras=t4_crop, + show_annotations=t4_show_ann, + ) + t4_res = client.render(req) + _imgs = t4_res.decode_all_images() + if not _imgs: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = ( + "T4 server returned no camera images for this frame. " + "Check that the dataset and scenario exist on the server and the frame index is valid." 
+ ) + st.session_state.pop("bbox_t4_success_key", None) + else: + st.session_state["bbox_t4_last_images"] = _imgs + st.session_state["bbox_t4_last_meta"] = { + "sample_token": t4_res.sample_token, + "timestamp_us": t4_res.timestamp_us, + "frame_index": int(frame), + "t4dataset_id": _ds_t4, + "scenario_name": _sc_t4, + } + st.session_state["bbox_t4_success_key"] = _req_key + st.session_state.pop("bbox_t4_error_key", None) + st.session_state.pop("bbox_t4_error_msg", None) + except T4VisualizerError as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state.pop("bbox_t4_success_key", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = f"T4 server error ({ex.status_code}): {ex}" + except (OSError, requests.RequestException) as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state.pop("bbox_t4_success_key", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = f"Network error: {ex}" + except Exception as ex: + st.session_state.pop("bbox_t4_last_images", None) + st.session_state.pop("bbox_t4_last_meta", None) + st.session_state.pop("bbox_t4_success_key", None) + st.session_state["bbox_t4_error_key"] = _req_key + st.session_state["bbox_t4_error_msg"] = f"T4 render failed: {ex}" + + _meta = st.session_state.get("bbox_t4_last_meta") + _imgs = st.session_state.get("bbox_t4_last_images") + _show_err = st.session_state.get("bbox_t4_error_msg") + + st.caption( + f"**Request:** t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4}` · frame_index `{frame}`" + ) + if _req_key == st.session_state.get("bbox_t4_error_key") and _show_err: + st.caption("T4 camera preview could not be loaded.") + with st.expander("Details", expanded=False): + st.caption( + f"t4dataset_id `{_ds_t4}` · scenario_name `{_sc_t4}` · frame_index `{frame}` · " + f"server `{base_url_t4}`" 
+ ) + st.markdown(_show_err) + elif _meta and _imgs: + st.caption( + f"**sample_token** `{_meta.get('sample_token', '')}` · " + f"**timestamp_us** `{_meta.get('timestamp_us', '')}`" + ) + _nc = min(3, max(1, len(_imgs))) + for _row_start in range(0, len(_imgs), _nc): + _cols_img = st.columns(_nc) + for _j, _k in enumerate(range(_row_start, min(_row_start + _nc, len(_imgs)))): + _lbl, _png = _imgs[_k] + with _cols_img[_j]: + st.caption(_lbl) + st.image(_png, use_container_width=True) + # ---------------------------- # Quick view: switch between "All (comparison)" and single-run view # ---------------------------- diff --git a/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py b/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py new file mode 100644 index 0000000..4334181 --- /dev/null +++ b/evaluation_dashboard_app/pages/5_T4_3D_Viewer.py @@ -0,0 +1,494 @@ +"""T4 dataset Three.js viewer: GT / prediction / matched 3D boxes via postMessage to `/viewer/three`.""" + +import duckdb +import requests +import streamlit as st +import numpy as np +import pandas as pd +import os +from pathlib import Path +from typing import Any, List + +from lib.path_utils import path_display +from lib.page_chrome import inject_app_page_styles, render_loaded_data_section, render_page_hero +from lib.t4_dataset_embed import t4_share_query_params +from lib.t4_three_layers import ( + build_three_layer_payload_all_frames, + infer_external_bbox_alignment_query_params, + render_t4_three_js_embed, + resolve_t4_dataset_id, + resolve_t4_scenario, +) +from lib.t4_visualizer_client import ( + DEFAULT_BASE_URL, + ENV_BASE_URL, + T4VisualizerClient, + T4VisualizerError, +) + +st.set_page_config( + layout="wide", + page_title="T4 3D Viewer", + page_icon="🧊", + initial_sidebar_state="expanded", +) +inject_app_page_styles() + +# ============================= +# Session state from Overview (run path) +# ============================= +if "runA" not in st.session_state: + st.warning("Please load data from the 
**Overview** page first (select mode and run(s)).") + st.stop() + +runA = st.session_state["runA"] +mode = st.session_state.get("mode", "Single Mode") +if mode == "Compare Mode": + all_runs = st.session_state.get("all_runs") + run_labels_state = st.session_state.get("run_labels") + if all_runs and run_labels_state and len(all_runs) >= 2: + runs = all_runs + run_labels_list = run_labels_state + else: + runB = st.session_state.get("runB") + runs = [runA] if runB is None else [runA, runB] + run_labels_list = ["A"] if len(runs) == 1 else ["A", "B"] +else: + runs = [runA] + run_labels_list = ["A"] + + +def list_parquets_in_run(run_path) -> List[str]: + p = Path(run_path) + if not p.is_dir(): + return [] + return sorted([str(f.resolve()) for f in p.glob("*.parquet")]) + + +parquet_lists = [list_parquets_in_run(r["path"]) for r in runs] +for i, (r, pl) in enumerate(zip(runs, parquet_lists)): + if not pl: + lbl = run_labels_list[i] if i < len(run_labels_list) else str(i) + st.error( + f"No parquet files in run ({lbl}): {path_display(r['path'])}. " + "Add a .parquet file or generate one from the Download page." + ) + st.stop() + +multi_run = len(runs) >= 2 + +_ld_entries = [] +for i, r in enumerate(runs): + lbl = run_labels_list[i] if i < len(run_labels_list) else str(i) + if lbl == "A": + _ltitle = "Baseline · A" + else: + _ltitle = f"Candidate · {lbl}" + _ld_entries.append((_ltitle, path_display(r["path"]))) +render_loaded_data_section(_ld_entries) +render_page_hero( + kicker="T4 visualizer", + title="T4 3D bounding box viewer", + description=( + "Embedded **Three.js** view with GT, prediction (EST), and UUID-matched pairs from parquet (**postMessage**). " + "Scrub **time inside the viewer** (bottom slider); eval boxes follow that frame. Same filters as the BEV page." 
+ ), + mode=mode, +) + +# ---------------------------- +# Sidebar (Filters) — shared keys with Bounding Box Viewer +# ---------------------------- +with st.sidebar: + st.markdown("##### Filters") + st.caption("Same scene / topic / labels as the BEV viewer. Frame / playback: use the **3D viewer** controls.") + + if multi_run: + runs_to_show = st.multiselect( + "Runs to show", + run_labels_list, + default=run_labels_list, + key="bbox_viewer_runs_to_show", + ) + if not runs_to_show: + st.warning("Select at least one run.") + st.stop() + else: + runs_to_show = run_labels_list + + selected_files = {} + for i, lbl in enumerate(run_labels_list): + if lbl not in runs_to_show: + continue + pl = parquet_lists[i] + if len(pl) == 1: + selected_files[lbl] = pl[0] + else: + selected_files[lbl] = st.selectbox( + f"File (Run {lbl})", + pl, + format_func=os.path.basename, + key=f"bbox_viewer_file_{lbl}", + ) + + first_shown = runs_to_show[0] if runs_to_show else run_labels_list[0] + filter_file = selected_files.get(first_shown) or parquet_lists[run_labels_list.index(first_shown)][0] + +con = duckdb.connect() + +cols = con.execute("DESCRIBE SELECT * FROM parquet_scan(?)", [filter_file]).df()["column_name"].tolist() +has_visibility = "visibility" in cols +has_suite_name = "suite_name" in cols +has_scenario_name = "scenario_name" in cols +has_t4dataset_name = "t4dataset_name" in cols +hover_extra_cols = [c for c in ["z", "height", "vx", "vy", "confidence", "pointcloud_num"] if c in cols] + +scene_where = "1=1" +scene_params: List[str] = [filter_file] + +if has_suite_name: + suite_list = con.execute( + "SELECT DISTINCT suite_name AS v FROM parquet_scan(?) 
WHERE suite_name IS NOT NULL ORDER BY v", + [filter_file], + ).df()["v"].dropna().astype(str).tolist() +else: + suite_list = [] + +if "bbox_viewer_link_suite" in st.session_state: + _lsu = st.session_state.pop("bbox_viewer_link_suite", None) + if suite_list and _lsu is not None and str(_lsu) in suite_list: + st.session_state["bbox_viewer_suite"] = str(_lsu) + +with st.sidebar: + selected_suite = None + selected_scenario = None + if suite_list: + selected_suite = st.selectbox( + "Suite name", + suite_list, + key="bbox_viewer_suite", + ) + if has_scenario_name: + if selected_suite is not None: + scenario_list = con.execute( + "SELECT DISTINCT scenario_name AS v FROM parquet_scan(?) WHERE suite_name = ? AND scenario_name IS NOT NULL ORDER BY v", + [filter_file, selected_suite], + ).df()["v"].dropna().astype(str).tolist() + else: + scenario_list = con.execute( + "SELECT DISTINCT scenario_name AS v FROM parquet_scan(?) WHERE scenario_name IS NOT NULL ORDER BY v", + [filter_file], + ).df()["v"].dropna().astype(str).tolist() + if scenario_list: + if "bbox_viewer_link_scenario" in st.session_state: + _lsc = st.session_state.pop("bbox_viewer_link_scenario", None) + if _lsc is not None and str(_lsc) in scenario_list: + st.session_state["bbox_viewer_scenario"] = str(_lsc) + selected_scenario = st.selectbox( + "Scenario name", + scenario_list, + key="bbox_viewer_scenario", + ) + t4dataset_list: List[str] = [] + if has_t4dataset_name: + t4_where_parts = ["t4dataset_name IS NOT NULL"] + t4_params: List[Any] = [filter_file] + if selected_suite is not None: + t4_where_parts.insert(0, "suite_name = ?") + t4_params.append(selected_suite) + if selected_scenario is not None: + t4_where_parts.insert(0, "scenario_name = ?") + t4_params.insert(1, selected_scenario) + t4_where = " AND ".join(t4_where_parts) + t4dataset_list = con.execute( + f"SELECT DISTINCT t4dataset_name AS v FROM parquet_scan(?) 
WHERE {t4_where} ORDER BY v", + t4_params, + ).df()["v"].dropna().astype(str).tolist() + has_multiple_t4dataset = len(t4dataset_list) > 1 + selected_t4dataset = None + if has_multiple_t4dataset and t4dataset_list: + if "bbox_viewer_link_t4dataset" in st.session_state: + _lt4 = st.session_state.pop("bbox_viewer_link_t4dataset", None) + if _lt4 is not None and str(_lt4) in t4dataset_list: + st.session_state["bbox_viewer_t4dataset"] = str(_lt4) + selected_t4dataset = st.selectbox( + "t4dataset_name", + t4dataset_list, + key="bbox_viewer_t4dataset", + ) + +if selected_suite is not None: + scene_where = "suite_name = ?" + scene_params = [filter_file, selected_suite] +if selected_scenario is not None: + scene_where = scene_where + " AND scenario_name = ?" if scene_where != "1=1" else "scenario_name = ?" + scene_params = scene_params + [selected_scenario] +if selected_t4dataset is not None: + scene_where = scene_where + " AND t4dataset_name = ?" if scene_where != "1=1" else "t4dataset_name = ?" + scene_params = scene_params + [selected_t4dataset] +if scene_where == "1=1": + scene_params = [filter_file] + +topic_names = con.execute( + f"SELECT DISTINCT topic_name AS v FROM parquet_scan(?) WHERE {scene_where} ORDER BY v", + scene_params, +).df()["v"].dropna().tolist() +if not topic_names: + for key in ( + "bbox_viewer_scenario", + "bbox_viewer_suite", + "bbox_viewer_link_suite", + "bbox_viewer_link_scenario", + "bbox_viewer_link_t4dataset", + ): + if key in st.session_state: + del st.session_state[key] + st.warning( + "No topic_name for the selected scene (from Detection Stats link). " + "Cleared scene selection; please choose a scene from the sidebar." + ) + st.rerun() + +with st.sidebar: + selected_topic = st.selectbox("topic_name (single)", topic_names) + +labels = con.execute( + f"SELECT DISTINCT label AS v FROM parquet_scan(?) WHERE {scene_where} AND topic_name=? 
ORDER BY v", + scene_params + [selected_topic], +).df()["v"].dropna().tolist() +if not labels: + st.warning("No label for selected topic.") + st.stop() + +with st.sidebar: + selected_labels = st.multiselect("label(s)", labels, default=labels) + +selected_visibility = None +if has_visibility: + vis_list = con.execute( + f"SELECT DISTINCT COALESCE(visibility,'UNKNOWN') AS v FROM parquet_scan(?) WHERE {scene_where} AND topic_name=? ORDER BY v", + scene_params + [selected_topic], + ).df()["v"].tolist() + with st.sidebar: + if vis_list: + selected_visibility = st.multiselect("visibility", vis_list, default=vis_list) + else: + st.info("No visibility values found — skipping.") +else: + with st.sidebar: + st.info("No 'visibility' column found — skipping visibility filter.") + +if not selected_labels: + st.warning("No label selected.") + st.stop() + +with st.sidebar: + st.markdown("##### T4 server") + st.caption("**GET /datasets/{id}/availability** must succeed before the iframe loads.") + if "bbox_t4_base_url" not in st.session_state: + st.session_state["bbox_t4_base_url"] = ( + (os.environ.get(ENV_BASE_URL) or DEFAULT_BASE_URL).strip() or DEFAULT_BASE_URL + ) + st.text_input( + "T4 server base URL", + key="bbox_t4_base_url", + help=f"Default from env `{ENV_BASE_URL}`. 
Embeds `/viewer/three` and posts GT / pred / matched bbox layers.", + ) + +# ---------------------------- +# Load data (same SQL as Bounding Box Viewer) +# ---------------------------- +where = [scene_where, "topic_name = ?"] +params = scene_params + [selected_topic] +where.append(f"label IN ({','.join(['?']*len(selected_labels))})") +params.extend(selected_labels) + +if has_visibility and selected_visibility: + where.append(f"COALESCE(visibility,'UNKNOWN') IN ({','.join(['?']*len(selected_visibility))})") + params.extend(selected_visibility) + +_renderer_optional_cols = [ + "unix_time", + "frame_id", + "z", + "height", + "shape_type", + "vx", + "vy", + "confidence", + "pointcloud_num", + "visibility", + "x_error", + "y_error", + "z_error", + "yaw_error", + "vx_error", + "vy_error", + "speed_error", + "center_distance", + "plane_distance", + "pair_dt_sec", + "pair_uuid", + "dx_min", + "dy_min", + "t4dataset_id", + "suite_name", + "t4dataset_name", + "scenario_name", +] +_select_cols = [ + "frame_index", + "x", + "y", + "length", + "width", + "yaw", + "label", + "topic_name", + "source", + "status", + "uuid", +] +_select_cols.extend(c for c in _renderer_optional_cols if c in cols and c not in _select_cols) +sql = f""" +SELECT {", ".join(_select_cols)} +FROM parquet_scan(?) 
+WHERE {" AND ".join(where)} +ORDER BY frame_index +""" + +files_to_load: List[tuple] = [(selected_files[lbl], lbl) for lbl in runs_to_show if lbl in selected_files] +base_params = scene_params[1:] + [selected_topic] + list(selected_labels) +if has_visibility and selected_visibility: + base_params = base_params + list(selected_visibility) + +dfs = [] +for file_path, run_label in files_to_load: + qparams = [file_path] + base_params + df_part = con.execute(sql, qparams).df() + if not df_part.empty: + df_part = df_part.copy() + df_part["run"] = run_label + dfs.append(df_part) + +if not dfs: + st.warning("No data matches the selected filters.") + st.stop() + +df = pd.concat(dfs, ignore_index=True) +if len(files_to_load) == 1: + df["run"] = df["run"].iloc[0] + +if "frame_index" in df.columns and not np.issubdtype(df["frame_index"].dtype, np.integer): + df["frame_index"] = ( + pd.to_numeric(df["frame_index"], errors="coerce").fillna(0).astype(int) + ) + +if len(files_to_load) == 1: + st.info(f"**Currently showing:** Run {files_to_load[0][1]} only") +else: + run_names = [f[1] for f in files_to_load] + st.info(f"**Currently showing:** Runs {', '.join(run_names)} — 3D layers include boxes from all selected runs.") + +f_min, f_max = int(df.frame_index.min()), int(df.frame_index.max()) + +# One reference slice for resolving t4dataset_id / scenario_name (same as iframe entry frame). 
+_ref_frame = f_min +df_frame = df[df.frame_index == _ref_frame] +if df_frame.empty and not df.empty: + df_frame = df.iloc[:1].copy() + +# ---------------------------- +# T4 Three.js embed +# ---------------------------- +base_url_t4 = (st.session_state.get("bbox_t4_base_url") or "").strip() or DEFAULT_BASE_URL + +_ds_t4 = resolve_t4_dataset_id(df_frame) +if not _ds_t4 and selected_t4dataset is not None: + _ds_t4 = str(selected_t4dataset) +_sc_t4 = resolve_t4_scenario(df_frame, selected_scenario) + +if not _ds_t4: + for _k in ( + "bbox_t4_last_images", + "bbox_t4_last_meta", + "bbox_t4_success_key", + "bbox_t4_error_key", + "bbox_t4_error_msg", + "bbox_t4_availability", + ): + st.session_state.pop(_k, None) + st.warning( + "Cannot resolve a T4 dataset id for this frame. Needs parquet **t4dataset_id** or **t4dataset_name**, " + f"or **t4dataset_name** in the sidebar when multiple datasets exist. Set **T4 server base URL** or `{ENV_BASE_URL}`." + ) +else: + _t4_avail_cache_key = f"{base_url_t4.rstrip('/')}|{_ds_t4}" + _cached_av = st.session_state.get("bbox_t4_availability") + _need_avail_fetch = _cached_av is None or _cached_av.get("cache_key") != _t4_avail_cache_key + if _need_avail_fetch: + try: + with st.spinner("Checking T4 dataset on the server…"): + _av_client = T4VisualizerClient(base_url=base_url_t4, timeout=2.0) + _av_data = _av_client.dataset_availability(_ds_t4) + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": True, + "available": bool(_av_data.get("available")), + "data": _av_data, + "error": None, + } + except T4VisualizerError as ex: + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"T4 server error ({ex.status_code}): {ex}", + } + except (OSError, requests.RequestException) as ex: + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, 
+ "error": f"Network error: {ex}", + } + except Exception as ex: + st.session_state["bbox_t4_availability"] = { + "cache_key": _t4_avail_cache_key, + "ok": False, + "available": False, + "data": None, + "error": f"Availability check failed: {ex}", + } + + _av = st.session_state.get("bbox_t4_availability") or {} + + if not _av.get("ok"): + st.error("Could not verify the dataset on the T4 visualizer server.") + with st.expander("Details", expanded=False): + st.markdown(_av.get("error") or "Unknown error.") + elif not _av.get("available"): + st.warning("This dataset is not available on the visualizer server host.") + with st.expander("Details", expanded=False): + _d = _av.get("data") + if isinstance(_d, dict) and _d: + st.json(_d) + else: + st.markdown( + "The server reported **available: false** (no local dataset path for this id on the machine " + "running `t4-server`)." + ) + else: + # Fixed entry frame so Streamlit slider does not reload the iframe; eval layers use bbox_layers_by_frame. 
+ _iframe_entry_frame = int(df["frame_index"].min()) + _q_three = t4_share_query_params(_ds_t4, _sc_t4, _iframe_entry_frame) + _q_three = f"{_q_three}&{infer_external_bbox_alignment_query_params(df)}" + _viewer_three_url = f"{base_url_t4.rstrip('/')}/viewer/three?{_q_three}" + _layer_payload = build_three_layer_payload_all_frames(df) + + _viewer_three_h = 1400 + render_t4_three_js_embed(_viewer_three_url, _layer_payload, height=_viewer_three_h) + +st.page_link("pages/4_Bounding_Box_Viewer.py", label="Back to Bounding Box & BEV viewer", icon="🖼️") diff --git a/evaluation_dashboard_app/pages/5_Tools.py b/evaluation_dashboard_app/pages/5_Tools.py deleted file mode 100644 index 0dc8958..0000000 --- a/evaluation_dashboard_app/pages/5_Tools.py +++ /dev/null @@ -1,118 +0,0 @@ -import streamlit as st -import re -import subprocess - -from lib.page_chrome import inject_app_page_styles, render_page_hero - -st.set_page_config( - page_title="lsim_analysis_tool runner", - page_icon="⚙️", - layout="centered", -) -inject_app_page_styles() -render_page_hero( - kicker="CLI bridge", - title="lsim_analysis_tool runner", - description=( - "Paste Autoware Evaluator report or suite URLs, generate shell snippets, and run analysis commands " - "from a simple form." 
- ), - mode="Single Run", -) - -# Constants and regexes -JOB_RE = re.compile(r"/reports/([0-9a-fA-F-]{36})") -SUITE_RE = re.compile(r"/suites/([0-9a-fA-F-]{36})") -DEFAULT_REPORT_URL = ( - "https://evaluation.tier4.jp/evaluation/reports/" - "71b8eec9-7e28-5f9c-9b89-8e88545e742f?project_id=x2_dev" -) -DEFAULT_SUITE_URL = ( - "https://evaluation.tier4.jp/evaluation/suites/" - "1af11feb-362d-4c48-b258-02cd433a3866?project_id=x2_dev" -) -DEFAULT_OUTPUT = "~/data/x2gen2/evaluator_summary/NO_shorten_left_lower_gpu2_No3/" - -def extract_job_id(report_url): - m = JOB_RE.search(report_url or "") - return m.group(1) if m else "" - -def extract_suite_id(suite_url): - m = SUITE_RE.search(suite_url or "") - return m.group(1) if m else "" - -# App state initialization -if 'report_url' not in st.session_state: - st.session_state['report_url'] = DEFAULT_REPORT_URL -if 'suite_url' not in st.session_state: - st.session_state['suite_url'] = DEFAULT_SUITE_URL - -# Layout inputs -with st.form(key="eval_runner_form"): - col1, col2 = st.columns([1, 1]) - with col1: - project_id = st.text_input("Project ID", value="x2_dev", key="project_id") - setup_bash = st.text_area( - "setup.bash path", - value="/home/leigu/pilot-auto.x2.v4.3/install/setup.bash", - key="setup_bash", - height=120, - placeholder="Enter full path(s) to your setup.bash file(s), one per line." - ) - output_dir = st.text_area( - "Output Directory", - value=DEFAULT_OUTPUT, - key="output_dir", - height=120, - placeholder="Enter one or more output directories, one per line." - ) - with col2: - report_url = st.text_area( - "Report URL", - value=st.session_state['report_url'], - key="report_url", - height=120, - placeholder="Paste the full Evaluation Report URL here." - ) - suite_url = st.text_area( - "Suite URL", - value=st.session_state['suite_url'], - key="suite_url", - height=120, - placeholder="Paste the full Evaluation Suite URL here." 
- ) - - # Job ID and Suite ID auto-extracted from URL text fields live as you type - # So always extract from form inputs (not session state nor callbacks) - job_id = extract_job_id(report_url) - suite_id = extract_suite_id(suite_url) - - st.text_input("Job ID", value=job_id, key="job_id", disabled=True) - st.text_input("Suite ID", value=suite_id, key="suite_id", disabled=True) - - # Build command - cmd = ( - f"./perception_evaluation_result_creator2.sh " - f"{setup_bash} " - f"./perception_eval_result_summarizer.py " - f"{project_id} " - f"{job_id} " - f"{suite_id} " - f"{output_dir}" - ) - - # Submit button as required for Streamlit forms - submitted = st.form_submit_button("Run in Terminal") - -# "Run in Terminal" logic -if submitted: - st.info(f"Command to run (copy below and paste into your terminal):\n\n{cmd}") - - -st.markdown(""" ---- -**Instructions:** -- Enter your parameters above. -- Job ID / Suite ID are automatically parsed when you enter the Evaluation URLs. -- Click **Run in Terminal** to show the command for copy-paste. 
-""") \ No newline at end of file diff --git a/evaluation_dashboard_app/pages/6_Download.py b/evaluation_dashboard_app/pages/6_Download.py index 83bae3b..2222722 100644 --- a/evaluation_dashboard_app/pages/6_Download.py +++ b/evaluation_dashboard_app/pages/6_Download.py @@ -39,10 +39,11 @@ def _to_jst(dt: Any) -> Optional[datetime]: from lib.user_config import UserConfig from lib.path_utils import get_data_root, resolve_under_data_root, to_data_relative from lib.eval_summary import find_eval_result_dirs, run_eval_result_for_dir, generate_summary_and_score_csv -from lib.page_chrome import inject_app_page_styles +from lib.page_chrome import ( + inject_app_page_styles, +) from lib.ui.download_ui import ( ImpressiveProgressHUD, - TaskCardMode, render_detailed_scenario_download_panel, render_download_hero, render_download_status_table_intro, @@ -51,11 +52,11 @@ def _to_jst(dt: Any) -> Optional[datetime]: render_job_json_summary_panel, render_recent_scenario_downloads_intro, render_scenario_download_summary_panel, - render_task_list_empty_state, - task_list_card_markup, ) +from lib.ui.task_history import get_task_list_current_user, render_task_list from lib.ui.styles_download import inject_download_page_styles from lib.db import ( + count_recent_tasks, create_task, delete_task, get_task, @@ -64,6 +65,7 @@ def _to_jst(dt: Any) -> Optional[datetime]: update_task_rq_job_id, ) from lib import download_core +from lib import evaluator_api from lib.auth import get_current_user_id, is_auth_enabled try: @@ -75,6 +77,12 @@ def _to_jst(dt: Any) -> Optional[datetime]: # Task queue panel: time window + row cap (must match header + list_recent_tasks) _TASK_LIST_SINCE_DAYS = 7 _TASK_LIST_MAX_ROWS = 200 +_TASK_HISTORY_RANGE_OPTIONS = { + "7 days": 7, + "30 days": 30, + "90 days": 90, + "All": None, +} def _parse_rq_timeout_sec(raw: Optional[str], *, default: int, minimum: int) -> int: if raw is None or not str(raw).strip(): @@ -103,6 +111,71 @@ def _parse_rq_timeout_sec(raw: 
Optional[str], *, default: int, minimum: int) -> else: _BUILD_PARQUET_JOB_TIMEOUT_SEC = _RQ_DEFAULT_JOB_TIMEOUT_SEC +_DEFAULT_EVAL_WORKERS = 4 + + +def _default_eval_workers() -> int: + try: + workers = int(os.environ.get("EVAL_WORKERS_DEFAULT", _DEFAULT_EVAL_WORKERS)) + except (TypeError, ValueError): + workers = _DEFAULT_EVAL_WORKERS + return max(1, min(workers, 16)) + + +_APP_ROOT = Path(__file__).resolve().parents[1] +_CATALOGS_FILENAME = "catalogs.json" +_LEGACY_CATALOGS_PATH = Path("/home/leigu/EvaluatorRunnerUITest/catalogs.json") + + +def _catalog_preset_candidate_paths() -> List[Path]: + """Return catalog preset paths in priority order.""" + paths: List[Path] = [] + env_path = os.environ.get("EVAL_CATALOGS_PATH") + if env_path: + paths.append(Path(env_path).expanduser()) + + paths.extend( + [ + _APP_ROOT / _CATALOGS_FILENAME, + Path.cwd() / _CATALOGS_FILENAME, + _LEGACY_CATALOGS_PATH, + ] + ) + + unique_paths: List[Path] = [] + seen = set() + for path in paths: + key = os.fspath(path) + if key not in seen: + unique_paths.append(path) + seen.add(key) + return unique_paths + + +def _load_catalog_presets() -> tuple[List[Dict[str, Any]], Optional[Path], Optional[str]]: + """Load evaluator catalog presets from the first available catalogs.json.""" + required_keys = {"display_name", "catalog_id", "integration_id"} + for path in _catalog_preset_candidate_paths(): + if not path.is_file(): + continue + + try: + with path.open("r", encoding="utf-8") as f: + presets = json.load(f) + if not isinstance(presets, list): + raise ValueError("catalog preset file must contain a JSON list") + + valid_presets = [ + preset + for preset in presets + if isinstance(preset, dict) and required_keys.issubset(preset) + ] + return valid_presets, path, None + except Exception as exc: + return [], path, str(exc) + + return [], None, None + def _enqueue_task( task_type: str, @@ -775,345 +848,79 @@ def download_scenarios( -def _task_type_label(task_type: str) -> str: - """Human-readable 
label for task type.""" - labels = { - "download_results": "Download results", - "download_scenarios": "Download scenarios", - "run_eval_dirs": "Run eval dirs", - "generate_summary_csv": "Generate summary CSV", - "build_parquet": "Build parquet", - } - return labels.get(task_type, task_type or "Task") - - -def _task_summary(t: Dict[str, Any]) -> str: - """One-line summary from task parameters (job_id, output_path, etc.).""" - params = t.get("parameters") or {} - task_type = t.get("type", "") - if task_type == "download_results": - out = params.get("output_path") or params.get("job_id") or "" - return f"job_id={params.get('job_id', '')} → {out}" - if task_type == "download_scenarios": - out = params.get("output_dir") or params.get("output_path") or "" - return f"job_id={params.get('job_id', '')} → {out}" - if task_type in ("run_eval_dirs", "generate_summary_csv"): - return params.get("eval_root", "") - if task_type == "build_parquet": - return params.get("pkl_dir", "") - return "" - - -def _task_time_str(t: Dict[str, Any]) -> str: - """Format task created_at for display in JST (e.g. 
'Feb 24, 16:45').""" - created = t.get("created_at") - dt = _to_jst(created) if created else None - if not dt: - return "—" - try: - return dt.strftime("%b %d, %H:%M") - except Exception: - return str(created)[:16] if created else "—" - - -def _task_duration(t: Dict[str, Any]) -> Optional[str]: - """Format duration from created_at to updated_at if both exist.""" - created = t.get("created_at") - updated = t.get("updated_at") - if not created or not updated: - return None - try: - start = created.timestamp() if hasattr(created, "timestamp") else None - end = updated.timestamp() if hasattr(updated, "timestamp") else None - if start is None or end is None: - return None - secs = int(end - start) - if secs < 60: - return f"{secs}s" - if secs < 3600: - return f"{secs // 60}m {secs % 60}s" - return f"{secs // 3600}h {(secs % 3600) // 60}m" - except Exception: - return None - - -def _render_summary_table(rows: Optional[List[Dict[str, Any]]]) -> None: - """Render a summary table from rows (e.g. 
Scenario Name, Scenario ID, Status) when present.""" - if not rows: - return - try: - df = pd.DataFrame(rows) - st.subheader("Download Status") - st.dataframe(df, width="stretch") - except Exception: - pass - - -def _render_result_summary(summary: Dict[str, Any]) -> None: - """Render a result summary block (like local mode) from task result_summary JSON.""" - job = summary.get("job", "") - if job == "download_results": - total = summary.get("total", 0) - success = summary.get("success", 0) - failed = summary.get("failed", 0) - out = summary.get("output_path", "") - st.subheader("Summary") - st.write(f"- Total scenarios processed: **{total}**") - st.write(f"- Successfully downloaded: **{success}**") - if failed: - st.write(f"- Failed: **{failed}**") - st.write(f"- Output directory: `{out}`") - if success > 0: - st.info("To generate the final summary CSV files, go to the **Eval Results** tab and run the evaluation.") - _render_summary_table(summary.get("rows")) - elif job == "download_scenarios": - total = summary.get("total", 0) - success = summary.get("success", 0) - failed = summary.get("failed", 0) - out = summary.get("output_path", "") - st.subheader("Summary") - st.write(f"- Total scenarios: **{total}**") - st.write(f"- Successfully downloaded: **{success}**") - if failed: - st.write(f"- Failed: **{failed}**") - st.write(f"- Result JSON files: **{total}** downloaded.") - st.write(f"- Output directory: `{out}`") - if success > 0: - st.info("To generate summary CSV files, go to the **Eval Results** tab and run the evaluation.") - _render_summary_table(summary.get("rows")) - elif job == "run_eval_dirs": - dirs = summary.get("directories_processed", 0) - path = summary.get("summary_path", "") - srows = summary.get("summary_rows", 0) - scrows = summary.get("score_rows", 0) - st.subheader("Eval Summary") - st.write(f"- Directories processed: **{dirs}**") - st.write(f"- Generated Summary.csv (**{srows}** rows) and Score.csv (**{scrows}** rows) in `{path}`") - elif job 
== "generate_summary_csv": - path = summary.get("summary_path", "") - srows = summary.get("summary_rows", 0) - scrows = summary.get("score_rows", 0) - st.subheader("Summary") - st.write(f"- Generated Summary.csv (**{srows}** rows) and Score.csv (**{scrows}** rows) in `{path}`") - elif job == "build_parquet": - path = summary.get("output_path", "") - st.subheader("Summary") - st.write(f"- Output: `{path}`") - else: - st.json(summary) - - -def _render_task_detail_content(t: Dict[str, Any]) -> None: - """Render full task detail (summary, path, error, log, params) into current container.""" - try: - _render_task_detail_content_impl(t) - except Exception as e: - st.error(f"Could not load task details: {e}") - import traceback - st.code(traceback.format_exc(), language=None) - - -def _render_task_detail_content_impl(t: Dict[str, Any]) -> None: - """Implementation of task detail rendering (called inside try/except).""" - status = t.get("status", "") - created_jst = _to_jst(t.get("created_at")) - updated_jst = _to_jst(t.get("updated_at")) - time_parts = [] - if created_jst: - try: - time_parts.append(f"Created: {created_jst.strftime('%Y-%m-%d %H:%M:%S')} JST") - except Exception: - time_parts.append(f"Created: {t.get('created_at')}") - if updated_jst and updated_jst != created_jst: - try: - time_parts.append(f"Updated: {updated_jst.strftime('%Y-%m-%d %H:%M:%S')} JST") - except Exception: - time_parts.append(f"Updated: {t.get('updated_at')}") - if time_parts: - st.caption(" · ".join(time_parts)) - result_summary_raw = t.get("result_summary") - if result_summary_raw: - try: - result_summary = json.loads(result_summary_raw) if isinstance(result_summary_raw, str) else result_summary_raw - _render_result_summary(result_summary) - st.markdown("---") - except (TypeError, ValueError): - pass - if t.get("result_path"): - st.text_input("Result path", value=t["result_path"], key=f"rp_modal_{str(t.get('id'))}", disabled=True, label_visibility="collapsed") - if status == "failed" and 
t.get("error_message"): - st.error(t.get("error_message")) - log_output = (t.get("log_output") or "").strip() - if log_output: - st.caption("Log output") - st.code(log_output, language=None) - params = t.get("parameters") or {} - if params: - st.caption("Parameters") - st.json(params) - - -def _open_task_detail(task_id: str) -> None: - st.session_state["_task_detail_id"] = str(task_id) - - -def _render_one_task_row( - t: Dict[str, Any], - current_user: Optional[str], - use_dialog: bool, - *, - mode: TaskCardMode, -) -> None: - """One task: compact card + View/Delete (and inline More when no dialog).""" - task_id = t.get("id", "") - task_type = t.get("type", "") - status = t.get("status", "") - status_labels = {"pending": "Pending", "running": "Running", "completed": "Completed", "failed": "Failed"} - status_label = status_labels.get(status, status) - type_label = _task_type_label(task_type) - summary = _task_summary(t) - duration = _task_duration(t) or "—" - time_str = _task_time_str(t) - sid = str(task_id) - if mode == "history": - summary_short = (summary[:72] + "…") if summary and len(summary) > 72 else (summary or "—") - else: - summary_short = "—" - progress_msg = (t.get("progress_message") or "").strip() - _card = task_list_card_markup( - task_id=sid, - type_label=type_label, - status=status, - status_label=status_label, - time_str=time_str, - duration=duration, - summary_short=summary_short, - progress_pct=t.get("progress_pct"), - progress_message=progress_msg, - mode=mode, - ) - st.markdown(f'
{_card}
', unsafe_allow_html=True) - - if use_dialog: - bv, bd, _sp = st.columns([1.15, 1.15, 4]) - with bv: - st.button("View", key=f"view_{sid}", on_click=_open_task_detail, args=(sid,)) - with bd: - _stop_lbl = "Stop" if status in ("pending", "running") else "Remove" - _stop_help = ( - "Cancels the Redis/RQ job when possible, then removes this row from the list." - if status in ("pending", "running") - else "Remove this row from the task list." - ) - if st.button( - _stop_lbl, - key=f"del_{sid}", - type="secondary", - help=_stop_help, - ): - delete_task(sid, session_id=current_user) - st.rerun() - else: - bd, _sp = st.columns([1.15, 4]) - with bd: - _stop_lbl = "Stop" if status in ("pending", "running") else "Remove" - _stop_help = ( - "Cancels the Redis/RQ job when possible, then removes this row from the list." - if status in ("pending", "running") - else "Remove this row from the task list." - ) - if st.button( - _stop_lbl, - key=f"del_{sid}", - type="secondary", - help=_stop_help, - ): - delete_task(sid, session_id=current_user) - st.rerun() - - if not use_dialog: - with st.expander("More", expanded=False): - _render_task_detail_content(t) - - -def _render_task_list(tasks: List[Dict[str, Any]], current_user: Optional[str]) -> bool: - """Active tasks visible; completed/failed in a collapsed expander. 
True if any active.""" - if current_user: - st.caption(f"Logged in as **{current_user}** · your recent tasks only") - if not tasks: - render_task_list_empty_state() - return False - - active = [t for t in tasks if t.get("status") in ("pending", "running")] - history = [t for t in tasks if t.get("status") not in ("pending", "running")] - use_dialog = callable(getattr(st, "dialog", None)) - - for t in active: - _render_one_task_row(t, current_user, use_dialog, mode="active_compact") - - if not active: - st.caption("No queued or running jobs.") - - if history: - with st.expander(f"Task history ({len(history)})", expanded=False): - for t in history: - _render_one_task_row(t, current_user, use_dialog, mode="history") - - # Modal for task detail when dialog is available - if use_dialog and st.session_state.get("_task_detail_id"): - _task_id = st.session_state["_task_detail_id"] - try: - detail_task = next((x for x in tasks if str(x.get("id")) == _task_id), None) - if detail_task is None: - detail_task = get_task(_task_id) - if detail_task: - - @st.dialog("Task details", width="large") - def _task_detail_modal(): - _render_task_detail_content(detail_task) - if st.button("Close"): - st.session_state.pop("_task_detail_id", None) - st.rerun() - - _task_detail_modal() - except Exception as e: - st.error(f"Could not open task details: {e}") - finally: - # Clear so X/outside click or error doesn't leave page stuck; next run shows main content - st.session_state.pop("_task_detail_id", None) - - return len(active) > 0 - - # Task queue status (production deployment); per-user when auth is enabled _current_user = None if is_task_queue_enabled(): - _current_user = get_current_user_id() if is_auth_enabled() else None + _current_user = get_task_list_current_user() render_download_task_section_header( since_days=_TASK_LIST_SINCE_DAYS, max_rows=_TASK_LIST_MAX_ROWS, ) + if "download_task_history_range" not in st.session_state: + st.session_state["download_task_history_range"] = "7 days" 
+ if "download_task_history_page_size" not in st.session_state: + st.session_state["download_task_history_page_size"] = 20 + if "download_task_history_page" not in st.session_state: + st.session_state["download_task_history_page"] = 1 + + _control_cols = st.columns([1.3, 1.0, 1.0, 2.7]) + with _control_cols[0]: + _selected_range = st.selectbox( + "Task history range", + options=list(_TASK_HISTORY_RANGE_OPTIONS.keys()), + key="download_task_history_range", + ) + with _control_cols[1]: + _page_size = int( + st.selectbox( + "Task rows", + options=[20, 50, 100], + key="download_task_history_page_size", + ) + ) + _since_days = _TASK_HISTORY_RANGE_OPTIONS.get(_selected_range, _TASK_LIST_SINCE_DAYS) + _total_tasks = count_recent_tasks(session_id=_current_user, since_days=_since_days) + _page_count = max(1, (_total_tasks + _page_size - 1) // _page_size) if _total_tasks else 1 + _current_page = min(max(1, int(st.session_state.get("download_task_history_page", 1))), _page_count) + st.session_state["download_task_history_page"] = _current_page + with _control_cols[2]: + _selected_page = st.selectbox( + "Task page", + options=list(range(1, _page_count + 1)), + index=_current_page - 1, + key="download_task_history_page_select", + ) + if int(_selected_page) != _current_page: + _current_page = int(_selected_page) + st.session_state["download_task_history_page"] = _current_page + with _control_cols[3]: + _range_label = _selected_range if _since_days is not None else "all time" + st.caption(f"Showing **{_total_tasks}** tasks across **{_page_count}** page(s) for **{_range_label}**.") + + _offset = (_current_page - 1) * _page_size _use_fragment = getattr(st, "fragment", None) is not None if _use_fragment: try: @st.fragment(run_every=timedelta(seconds=3)) def _task_list_poll(): _t = list_recent_tasks( - limit=_TASK_LIST_MAX_ROWS, + limit=_page_size, + offset=_offset, session_id=_current_user, - since_days=_TASK_LIST_SINCE_DAYS, + since_days=_since_days, ) - _render_task_list(_t, 
_current_user) + render_task_list(_t, _current_user) _task_list_poll() except (TypeError, AttributeError): _use_fragment = False if not _use_fragment: tasks = list_recent_tasks( - limit=_TASK_LIST_MAX_ROWS, + limit=_page_size, + offset=_offset, session_id=_current_user, - since_days=_TASK_LIST_SINCE_DAYS, + since_days=_since_days, ) - has_active = _render_task_list(tasks, _current_user) + has_active = render_task_list(tasks, _current_user) if st.button("Refresh task list", key="refresh_tasks"): st.rerun() if has_active: @@ -1136,6 +943,1559 @@ def _run_eval_result_worker(result_dir: str, overwrite: bool) -> Dict[str, Any]: return run_eval_result_for_dir(result_dir, overwrite=overwrite) +def _parse_api_dt(value: Any) -> Optional[datetime]: + """Parse evaluator API timestamps into timezone-aware datetimes.""" + if value is None: + return None + if isinstance(value, datetime): + if getattr(value, "tzinfo", None) is None: + return value.replace(tzinfo=timezone.utc) + return value + try: + text = str(value).strip() + if not text: + return None + if text.endswith("Z"): + text = text[:-1] + "+00:00" + dt = datetime.fromisoformat(text) + if getattr(dt, "tzinfo", None) is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt + except Exception: + return None + + +def _format_jst_time(value: Any, *, include_seconds: bool = False) -> str: + """Format timestamps for display in JST.""" + dt = _to_jst(_parse_api_dt(value)) + if not dt: + return "—" + return dt.strftime("%Y-%m-%d %H:%M:%S JST" if include_seconds else "%Y-%m-%d %H:%M JST") + + +def _format_jst_time_compact(value: Any) -> str: + """Compact timestamp for dense recent-job rows.""" + dt = _to_jst(_parse_api_dt(value)) + if not dt: + return "—" + return dt.strftime("%m-%d %H:%M") + + +def _format_jst_time_title(value: Any) -> str: + """Readable timestamp for fallback job titles.""" + dt = _to_jst(_parse_api_dt(value)) + if not dt: + return "unknown time" + return f"{dt.year}/{dt.month}/{dt.day} 
{dt.hour}:{dt.minute:02d}:{dt.second:02d}" + + +def _format_relative_time(value: Any) -> str: + """Human-friendly age/duration from a timestamp until now.""" + dt = _parse_api_dt(value) + if not dt: + return "—" + now = datetime.now(timezone.utc) + secs = max(0, int((now - dt.astimezone(timezone.utc)).total_seconds())) + if secs < 60: + return f"{secs}s ago" + if secs < 3600: + return f"{secs // 60}m ago" + if secs < 86400: + return f"{secs // 3600}h ago" + return f"{secs // 86400}d ago" + + +def _format_duration(start_value: Any, end_value: Any) -> str: + """Format elapsed duration between two evaluator timestamps.""" + start = _parse_api_dt(start_value) + end = _parse_api_dt(end_value) + if not start or not end: + return "—" + secs = max(0, int((end - start).total_seconds())) + if secs < 60: + return f"{secs}s" + if secs < 3600: + return f"{secs // 60}m {secs % 60}s" + return f"{secs // 3600}h {(secs % 3600) // 60}m" + + +def _extract_git_target(report: Dict[str, Any]) -> str: + """Return a compact branch/tag label from evaluator job report metadata.""" + source = ((report.get("event") or {}).get("source") or {}) + git_ref = str(source.get("git_ref") or "").strip() + if git_ref.startswith("refs/heads/"): + return git_ref[len("refs/heads/"):] + if git_ref.startswith("refs/tags/"): + return git_ref[len("refs/tags/"):] + return git_ref or str(source.get("git_sha") or "").strip()[:12] or "—" + + +def _extract_catalog_url(report: Dict[str, Any]) -> str: + """Return a best-effort catalog URL for linking from recent evaluator jobs.""" + catalog = report.get("catalog") or {} + direct_url = str( + catalog.get("web_url") + or catalog.get("url") + or catalog.get("catalog_url") + or "" + ).strip() + if direct_url: + return direct_url + + project_id = str(report.get("project_id") or "").strip() + catalog_id = str( + catalog.get("catalog_id") + or catalog.get("id") + or "" + ).strip() + if project_id and catalog_id: + return 
f"https://evaluation.tier4.jp/evaluation/vehicle_catalogs/{catalog_id}?project_id={project_id}" + return "" + + +def _extract_job_title(report: Dict[str, Any]) -> str: + """Prefer evaluator description for display title, with a readable fallback.""" + description = str(report.get("description") or "").strip() + if description: + return description + started_like = report.get("started_at") or report.get("scheduled_at") or report.get("finished_at") + return f"no description (Started at {_format_jst_time_title(started_like)})" + + +def _extract_case_totals(report: Dict[str, Any]) -> Dict[str, int]: + """Return total/success/failed/canceled counts from job report.""" + test = report.get("test") or {} + result = test.get("available_case_results") or test.get("case_results") or {} + return { + "total": int(result.get("total_count", 0) or 0), + "success": int(result.get("success_count", 0) or 0), + "failed": int(result.get("failure_count", 0) or 0), + "canceled": int(result.get("cancellation_count", 0) or 0), + } + + +def _extract_failed_case_rows(case_reports: List[Dict[str, Any]], *, limit: int = 50) -> List[Dict[str, Any]]: + """Normalize failed case rows for display tables.""" + rows: List[Dict[str, Any]] = [] + for report in case_reports: + status = str(report.get("status") or "").strip().lower() + result_status = str(((report.get("result") or {}).get("status") or "")).strip().lower() + if status not in evaluator_api.FAILED_JOB_STATUSES and result_status not in evaluator_api.FAILED_JOB_STATUSES: + continue + logs = report.get("logs") or {} + rows.append( + { + "Suite": ((report.get("suite") or {}).get("display_name") or ""), + "Scenario": ((report.get("scenario") or {}).get("display_name") or ""), + "Status": report.get("status", ""), + "Fail message": report.get("fail_message", ""), + "Cause": ", ".join(report.get("failure_cause_labels", []) or []), + "Archive log": "yes" if ((logs.get("simulation_archive") or {}).get("id")) else "no", + "Result JSON": "yes" if 
((logs.get("simulation_result_json") or {}).get("id")) else "no", + } + ) + rows.sort(key=lambda row: (row["Suite"], row["Scenario"], row["Fail message"])) + return rows[:limit] + + +def _extract_suite_rows(suite_rows: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Normalize suite summary rows for display tables.""" + rows = [ + { + "Suite": row.get("name", ""), + "Total": int(row.get("all", 0) or 0), + "Success": int(row.get("success", 0) or 0), + "Failed": int(row.get("fail", 0) or 0), + "Canceled": int(row.get("cancel", 0) or 0), + "Simulation": row.get("simulation", ""), + "Report": row.get("url", ""), + } + for row in suite_rows or [] + ] + rows.sort(key=lambda row: (-row["Failed"], row["Suite"])) + return rows + + +def _extract_suite_selection_options(suite_rows: List[Dict[str, Any]]) -> List[Dict[str, str]]: + """Build suite picker options from evaluator suite summary rows.""" + options: List[Dict[str, str]] = [] + seen_ids = set() + for row in suite_rows or []: + report_url = str(row.get("url") or row.get("Report") or "").strip() + suite_id = "" + if "/tests/" in report_url: + tail = report_url.split("/tests/", 1)[1] + suite_id = tail.split("?", 1)[0].split("/", 1)[0].strip() + if not suite_id or suite_id in seen_ids: + continue + seen_ids.add(suite_id) + suite_name = str(row.get("name") or row.get("Suite") or suite_id).strip() + options.append({"id": suite_id, "label": f"{suite_name} ({suite_id})"}) + return options + + +def _status_color_variant(status: str) -> str: + """Map evaluator status to a style token used by the recent-job cards.""" + normalized = evaluator_api.normalize_job_status(status) + if normalized in evaluator_api.SUCCESS_JOB_STATUSES: + return "success" + if normalized in ("canceled", "cancelled", "aborted"): + return "canceled" + if normalized in evaluator_api.FAILED_JOB_STATUSES: + return "failed" + if normalized in ("started", "running", "pending", "queued", "created"): + return "running" + return "unknown" + + +def 
_status_display_label(status: str) -> str: + """Short status label for compact list rows.""" + normalized = evaluator_api.normalize_job_status(status) + if normalized in ("succeeded", "success"): + return "success" + if normalized in ("failed", "failure", "error"): + return "failed" + if normalized in ("canceled", "cancelled", "aborted"): + return "canceled" + if normalized in ("started", "running"): + return "running" + if normalized in ("pending", "queued", "created"): + return "queued" + return normalized or "unknown" + + +def _status_filter_values(selected_statuses: List[str]) -> List[str]: + """Normalize UI status filters into API status values.""" + values: List[str] = [] + for raw in selected_statuses: + normalized = evaluator_api.normalize_job_status(raw) + if normalized == "unknown" or not normalized: + continue + if normalized == "running": + values.extend(["running", "started"]) + elif normalized == "success": + values.extend(["success", "succeeded"]) + elif normalized == "failed": + values.extend(["failed", "failure", "error"]) + elif normalized == "canceled": + values.extend(["canceled", "cancelled", "aborted"]) + else: + values.append(normalized) + return sorted(set(values)) + + +def _escape_search_match_value(value: str) -> str: + """Escape wildcard characters for API Match filters.""" + return ( + value.replace("\\", "\\\\") + .replace("*", "\\*") + .replace("?", "\\?") + ) + + +def _build_recent_job_search_filter( + search_text: str, + search_scope: str, + user_directory: Optional[Dict[str, Dict[str, str]]] = None, +) -> tuple[Optional[Dict[str, Any]], str]: + """Map quick-search UI to one server-side filter and a client-side needle.""" + needle = search_text.strip() + if not needle: + return None, "" + + if search_scope == "Branch/tag": + return ( + { + "field": "event.source.git_ref", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Description": + return ( + { + 
"field": "description", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Job ID": + return ( + { + "field": "job_id", + "operator": "In", + "values": [needle], + }, + needle.lower(), + ) + if search_scope == "Git SHA": + return ( + { + "field": "event.source.git_sha", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + if search_scope == "Fail message": + return ( + { + "field": "fail_message", + "operator": "Match", + "values": [f"*{_escape_search_match_value(needle)}*"], + }, + needle.lower(), + ) + return None, needle.lower() + + +def _recent_job_search_history_key(scope: str) -> str: + return f"recent_eval_jobs_search_history::{scope}" + + +def _get_recent_job_search_history(scope: str) -> List[str]: + stored = get_config_value(_recent_job_search_history_key(scope), []) or [] + if not isinstance(stored, list): + return [] + return [str(v).strip() for v in stored if str(v).strip()] + + +def _save_recent_job_search_history(scope: str, value: str, *, max_items: int = 8) -> None: + text = str(value).strip() + if not text: + return + history = _get_recent_job_search_history(scope) + updated = [text] + [item for item in history if item != text] + set_config_value(_recent_job_search_history_key(scope), updated[:max_items]) + + +def _get_recent_eval_user_directory() -> Dict[str, Dict[str, str]]: + stored = get_config_value("recent_eval_jobs_user_directory", {}) or {} + if not isinstance(stored, dict): + return {} + normalized: Dict[str, Dict[str, str]] = {} + for subject_id, info in stored.items(): + if not isinstance(info, dict): + continue + normalized[str(subject_id)] = { + "name": str(info.get("name") or "").strip(), + "email": str(info.get("email") or "").strip(), + "subject_id": str(info.get("subject_id") or subject_id).strip(), + } + return normalized + + +def _save_recent_eval_user_directory(directory: Dict[str, Dict[str, 
str]]) -> None: + set_config_value("recent_eval_jobs_user_directory", directory) + + +@st.cache_data(ttl=24 * 3600, show_spinner=False) +def _fetch_auth_member_profile(subject_id: str, environment: str) -> Dict[str, str]: + subject = str(subject_id or "").strip() + if not subject: + return {} + org_id = os.environ.get( + "WEBAUTO_ORGANIZATION_ID", + "5a21621d-6968-4f7d-94f8-99cfb77b6e71", + ).strip() + if not org_id: + return {"subject_id": subject, "name": subject, "email": ""} + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + from webautoauth.token import HttpService, TokenSource, load_config + + config = load_config() + token_source = TokenSource(HttpService(config)) + access_token = token_source.get_token().access_token + quoted_subject = urllib.parse.quote(subject, safe="") + url = f"https://auth.web.auto/v2/organizations/{org_id}/members/{quoted_subject}" + response = requests.get( + url, + headers={"Authorization": f"Bearer {access_token}", "accept": "application/json"}, + timeout=10, + ) + response.raise_for_status() + data = response.json() + return { + "subject_id": str(data.get("subject_id") or subject), + "name": str(data.get("name") or subject).strip(), + "email": str(data.get("email") or "").strip(), + } + + +def _hydrate_recent_eval_user_directory( + jobs: List[Dict[str, Any]], + environment: str, +) -> Dict[str, Dict[str, str]]: + directory = _get_recent_eval_user_directory() + unresolved = sorted( + { + str(job.get("scheduled_by") or "").strip() + for job in jobs + if str(job.get("scheduled_by") or "").strip() + and str(job.get("scheduled_by") or "").strip() not in directory + } + ) + if not unresolved: + return directory + + updates: Dict[str, Dict[str, str]] = {} + with ThreadPoolExecutor(max_workers=min(6, len(unresolved))) as executor: + future_map = { + executor.submit(_fetch_auth_member_profile, subject_id, environment): subject_id + for subject_id in unresolved + } + for future in as_completed(future_map): + subject_id = 
future_map[future] + try: + profile = future.result() + except Exception: + profile = { + "subject_id": subject_id, + "name": subject_id, + "email": "", + } + updates[subject_id] = { + "subject_id": str(profile.get("subject_id") or subject_id).strip(), + "name": str(profile.get("name") or subject_id).strip(), + "email": str(profile.get("email") or "").strip(), + } + + if updates: + directory = {**directory, **updates} + _save_recent_eval_user_directory(directory) + return directory + + +def _build_recent_job_date_filters( + date_from: Optional[datetime.date], + date_to: Optional[datetime.date], +) -> List[Dict[str, Any]]: + """Build scheduled_at date-range filters for the search API.""" + filters: List[Dict[str, Any]] = [] + if date_from: + start_dt = datetime(date_from.year, date_from.month, date_from.day, 0, 0, 0, tzinfo=_JST) + filters.append( + { + "field": "scheduled_at", + "operator": "Gte", + "values": [start_dt.astimezone(timezone.utc).isoformat()], + } + ) + if date_to: + end_dt = datetime(date_to.year, date_to.month, date_to.day, 23, 59, 59, tzinfo=_JST) + filters.append( + { + "field": "scheduled_at", + "operator": "Lte", + "values": [end_dt.astimezone(timezone.utc).isoformat()], + } + ) + return filters + + +def _summarize_recent_job(report: Dict[str, Any]) -> Dict[str, Any]: + """Compact summary for one evaluator job card.""" + status = evaluator_api.extract_job_status(report) + totals = _extract_case_totals(report) + source = ((report.get("event") or {}).get("source") or {}) + git_url = str(source.get("git_web_url") or source.get("git_url") or "").strip() + source_repo_label = git_url.rstrip("/").split("/")[-1] if git_url else "—" + git_ref_label = _extract_git_target(report) + return { + "job_id": report.get("job_id") or report.get("id") or "", + "title": _extract_job_title(report), + "status": status, + "status_variant": _status_color_variant(status), + "build_status": ((report.get("build") or {}).get("status") or ""), + "test_status": 
((report.get("test") or {}).get("status") or ""), + "target": git_ref_label, + "catalog": ((report.get("catalog") or {}).get("display_name") or ""), + "catalog_url": _extract_catalog_url(report), + "description": report.get("description", ""), + "source_label": git_ref_label, + "source_repo_label": source_repo_label, + "scheduled_at": report.get("scheduled_at"), + "started_at": report.get("started_at"), + "finished_at": report.get("finished_at"), + "duration": _format_duration(report.get("started_at"), report.get("finished_at")), + "created_label": _format_relative_time(report.get("scheduled_at") or report.get("started_at")), + "scheduled_by": str(report.get("scheduled_by") or ""), + "report_url": evaluator_api.get_job_report_url(report.get("project_id", ""), report.get("job_id") or report.get("id") or ""), + "fail_message": report.get("fail_message", ""), + "total": totals["total"], + "success": totals["success"], + "failed": totals["failed"], + "canceled": totals["canceled"], + "git_sha": str(source.get("git_sha") or "")[:12], + "git_ref_url": source.get("git_ref_url", ""), + "git_commit_url": source.get("git_commit_url", ""), + "source_url": git_url, + } + + +@st.cache_data(ttl=30, show_spinner=False) +def _fetch_recent_evaluator_job_pages( + project_id: str, + environment: str, + page_size: int, + pages_to_fetch: int, + status_values: tuple[str, ...] = (), + extra_filters: tuple[tuple[str, str, tuple[Any, ...]], ...] 
= (), +) -> List[Dict[str, Any]]: + """Fetch recent evaluator jobs from the search endpoint page-by-page.""" + if not project_id: + return [] + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + api = evaluator_api.EvaluationRunAPI() + filters: List[Dict[str, Any]] = [] + if status_values: + filters.append( + { + "field": "status", + "operator": "In", + "values": list(status_values), + } + ) + for field, operator, values in extra_filters: + filters.append( + { + "field": field, + "operator": operator, + "values": list(values), + } + ) + next_token = "" + pages: List[Dict[str, Any]] = [] + for _ in range(max(1, int(pages_to_fetch))): + data = api.search_report_list( + project_id, + filters=filters or None, + next_token=next_token, + size=max(1, min(int(page_size), 100)), + ) + reports = data.get("reports", []) or [] + pages.append( + { + "jobs": [_summarize_recent_job(report) for report in reports], + "next_token": data.get("next_token", "") or "", + } + ) + next_token = data.get("next_token", "") or "" + if not next_token: + break + return pages + + +@st.cache_data(ttl=30, show_spinner=False) +def _fetch_evaluator_job_detail(project_id: str, environment: str, job_id: str) -> Dict[str, Any]: + """Fetch deep evaluator detail for one job on demand.""" + if not project_id or not job_id: + return {} + os.environ["AUTH_PROFILE"] = environment or ENVIRONMENT + api = evaluator_api.EvaluationRunAPI() + report = api.get_job_report(project_id, job_id) + suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True) + case_reports = api.get_case_reports(project_id, job_id) + summary = _summarize_recent_job(report) + return { + **summary, + "suite_rows": _extract_suite_rows(suite_rows), + "failed_case_rows": _extract_failed_case_rows(case_reports), + "raw_report": report, + } + + +def _inject_recent_evaluator_jobs_styles() -> None: + """Task-adjacent styles for the recent evaluator jobs section.""" + st.markdown( + """ + + """, + 
unsafe_allow_html=True, + ) + + +def _render_recent_evaluator_job_card(job: Dict[str, Any], *, user_label: str = "Unknown") -> None: + """Render one recent evaluator job as a single-row list item.""" + variant = html.escape(job.get("status_variant", "unknown")) + status = html.escape(_status_display_label(job.get("status", "unknown") or "unknown")) + title_text = html.escape(job.get("title", "—")) + description = html.escape(job.get("description", "") or "") + catalog = html.escape(job.get("catalog", "") or "—") + catalog_url = html.escape(job.get("catalog_url", "") or "") + scheduled = html.escape(_format_jst_time_compact(job.get("scheduled_at"))) + duration = html.escape(job.get("duration", "—")) + job_id = html.escape(str(job.get("job_id", ""))) + build_status = html.escape(job.get("build_status", "") or "—") + test_status = html.escape(job.get("test_status", "") or "—") + created_label = html.escape(job.get("created_label", "—")) + git_sha = html.escape(job.get("git_sha", "") or "—") + source_label = html.escape(job.get("source_label", "") or "—") + user_text = html.escape(user_label or "Unknown") + report_url = html.escape(job.get("report_url", "") or "") + source_url = html.escape(job.get("git_ref_url", "") or job.get("source_url", "") or "") + status_variant = job.get("status_variant", "unknown") + status_mark = { + "running": '', + "success": '', + "failed": '', + "canceled": '', + }.get(status_variant, '') + meta_line = job_id + counts = ( + f'S {int(job.get("success", 0))} · ' + f'F {int(job.get("failed", 0))} · ' + f'C {int(job.get("canceled", 0))} / ' + f'{int(job.get("total", 0))}' + ) + title_html = f'{title_text}' if report_url else title_text + source_html = ( + f'{source_label}' + if source_url else source_label + ) + catalog_html = ( + f'{catalog}' + if catalog_url else catalog + ) + st.markdown( + f""" +
+
+
+
{title_html}
+
{meta_line}
+
+
+ {status_mark}{status} +
+
+ {scheduled}
{duration} · {created_label} +
+
+ {catalog_html}
{source_html} +
+
+ {user_text} +
+
+ build {build_status} · test {test_status} · {git_sha}
+ {counts} +
+
+
+ """, + unsafe_allow_html=True, + ) + + +def _render_recent_evaluator_job_detail(project_id: str, environment: str, job: Dict[str, Any]) -> None: + """Render detailed evaluator-job information inside an expander.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.warning("Missing job id.") + return + try: + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + except Exception as e: + st.error(f"Could not fetch evaluator details: {e}") + return + + st.markdown("**Overview**") + top_cols = st.columns(4) + top_cols[0].metric("Total", int(detail.get("total", 0))) + top_cols[1].metric("Success", int(detail.get("success", 0))) + top_cols[2].metric("Failed", int(detail.get("failed", 0))) + top_cols[3].metric("Canceled", int(detail.get("canceled", 0))) + + overview_left, overview_right = st.columns([1.3, 1.1]) + with overview_left: + st.write(f"Status: `{detail.get('status', 'unknown')}`") + st.write(f"Title: `{detail.get('title', '—')}`") + st.write(f"Build/Test: `{detail.get('build_status', '—')}` / `{detail.get('test_status', '—')}`") + st.write(f"Ref: `{detail.get('target', '—')}`") + st.write(f"Catalog: `{detail.get('catalog', '—')}`") + st.write(f"Repo: `{detail.get('source_repo_label', '—')}`") + with overview_right: + st.write(f"Scheduled: `{_format_jst_time(detail.get('scheduled_at'), include_seconds=True)}`") + st.write(f"Started: `{_format_jst_time(detail.get('started_at'), include_seconds=True)}`") + st.write(f"Finished: `{_format_jst_time(detail.get('finished_at'), include_seconds=True)}`") + st.write(f"Duration: `{detail.get('duration', '—')}`") + st.write(f"SHA: `{detail.get('git_sha', '—')}`") + + action_cols = st.columns([1.2, 1.2, 4]) + report_url = detail.get("report_url", "") + catalog_url = detail.get("catalog_url", "") + source_url = detail.get("source_url", "") or detail.get("git_ref_url", "") + with action_cols[0]: + if report_url: + st.link_button("Open report", report_url, use_container_width=True) + with 
action_cols[1]: + if catalog_url: + st.link_button("Open catalog", catalog_url, use_container_width=True) + with action_cols[2]: + if source_url: + st.link_button("Open source", source_url, use_container_width=True) + + if detail.get("fail_message"): + st.warning(detail.get("fail_message")) + + suite_rows = detail.get("suite_rows") or [] + with st.expander(f"Suites ({len(suite_rows)})", expanded=bool(suite_rows)): + if suite_rows: + st.dataframe(pd.DataFrame(suite_rows), width="stretch", hide_index=True) + else: + st.caption("No suite summary available.") + + failed_case_rows = detail.get("failed_case_rows") or [] + with st.expander(f"Failed Cases ({len(failed_case_rows)})", expanded=bool(failed_case_rows)): + if failed_case_rows: + st.dataframe(pd.DataFrame(failed_case_rows), width="stretch", hide_index=True) + else: + st.caption("No failed cases in the current report.") + + with st.expander("Raw JSON", expanded=False): + st.json(detail.get("raw_report", {})) + + +def _render_recent_evaluator_job_run_dialog( + project_id: str, + environment: str, + job: Dict[str, Any], + *, + output_path_default: str, + download_type_default: str, + phase_default: str, + skip_large_file_default: bool, + large_file_mb_default: float, + keep_zip_files_default: bool, +) -> None: + """Render the dialog used to enqueue Download + Eval + Parquet from a recent job row.""" + job_id = str(job.get("job_id", "") or "") + if not job_id: + st.error("Missing evaluator job id.") + return + + detail = _fetch_evaluator_job_detail(project_id, environment, job_id) + suite_options = _extract_suite_selection_options(detail.get("suite_rows") or []) + suite_label_to_id = {opt["label"]: opt["id"] for opt in suite_options} + suite_labels = [opt["label"] for opt in suite_options] + + st.caption("Confirm the workflow options for this evaluator job, then start a background task.") + summary_cols = st.columns([1.45, 1.15, 1.35, 1.05]) + summary_cols[0].markdown(f"**Title** \n`{detail.get('title', '—')}`") + 
summary_cols[1].markdown(f"**Status** \n`{detail.get('status', 'unknown')}`") + summary_cols[2].markdown(f"**Catalog** \n`{detail.get('catalog', '—')}`") + summary_cols[3].markdown(f"**Cases** \n`{int(detail.get('total', 0))}`") + + with st.form(key=f"recent_eval_run_form_{job_id}", border=False): + run_output_path = st.text_input( + "Output path", + value=output_path_default, + help="Folder under the data directory. This uses the same safe path rules as the main download workflow.", + ) + + if not suite_labels: + hint_cols = st.columns([1.2, 2.8]) + with hint_cols[0]: + if st.form_submit_button("Refresh suites", use_container_width=True): + _fetch_evaluator_job_detail.clear() + st.rerun() + with hint_cols[1]: + st.caption("No suite candidates were available yet for this job. Refresh to re-read suite data from the evaluator API.") + + selected_suite_labels = st.multiselect( + "Suites to download (optional)", + options=suite_labels, + default=[], + help="Leave empty to download all suites from this evaluator job.", + disabled=not suite_labels, + ) + + run_download_type = st.radio( + "Download type", + ["Archives (ZIP)", "Result JSON only"], + index=0 if download_type_default == "Archives (ZIP)" else 1, + horizontal=True, + ) + + run_phase = "" + run_skip_large_file = False + run_large_file_mb = 50.0 + run_keep_zip_files = False + if run_download_type == "Archives (ZIP)": + run_phase = st.text_input( + "Phase to extract", + value=phase_default, + help="Enter the phase name to extract from archives.", + ) + opt_cols = st.columns([1.2, 1.3, 1.2]) + with opt_cols[0]: + run_skip_large_file = st.checkbox( + "Skip large files", + value=skip_large_file_default, + help="Skip unusually large archives during download.", + ) + with opt_cols[1]: + run_large_file_mb = st.number_input( + "Skip threshold (MB)", + min_value=1.0, + max_value=5000.0, + step=1.0, + value=float(large_file_mb_default), + ) + with opt_cols[2]: + run_keep_zip_files = st.checkbox( + "Keep ZIP files", + 
value=keep_zip_files_default, + help="Keep downloaded ZIPs after extraction.", + ) + + run_cols = st.columns([1.25, 1.25, 1.1]) + with run_cols[0]: + run_eval = st.checkbox( + "Run evaluation", + value=True, + help="Run eval_result and generate Summary.csv / Score.csv after download.", + ) + with run_cols[1]: + generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + disabled=not CATALOG_IO_AVAILABLE, + help="Build scene_result.parquet from .pkl files." if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable this.", + ) + with run_cols[2]: + eval_recursive = st.checkbox( + "Recursive eval", + value=True, + help="Search subdirectories for evaluation result folders.", + ) + + action_cols = st.columns([1.15, 1.15, 3.7]) + cancel_clicked = action_cols[0].form_submit_button("Cancel", use_container_width=True) + start_clicked = action_cols[1].form_submit_button("Start", type="primary", use_container_width=True) + + if cancel_clicked: + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + + if not start_clicked: + return + + resolved_output, path_err = resolve_under_data_root(run_output_path, allow_create=True) + if path_err: + st.error(f"Output path is invalid: {path_err}") + return + + selected_suite_ids = [suite_label_to_id[label] for label in selected_suite_labels] + resolved_path_str = str(resolved_output) + set_config_value("output_path", to_data_relative(resolved_output)) + set_config_value("environment", environment) + set_config_value("project_id", project_id) + set_config_value("job_id", job_id) + set_config_value("suite_id", "") + set_config_value("suite_ids", selected_suite_ids) + set_config_value("download_type", run_download_type) + if run_download_type == "Archives (ZIP)": + set_config_value("phase", run_phase) + set_config_value("skip_large_file", run_skip_large_file) + set_config_value("large_file_mb", run_large_file_mb) + set_config_value("keep_zip_files", run_keep_zip_files) + + 
params = { + "output_path": resolved_path_str, + "project_id": project_id, + "job_id": job_id, + "suite_id": "", + "suite_ids": selected_suite_ids or None, + "download_type": "archives" if run_download_type == "Archives (ZIP)" else "result_json", + "phase": run_phase if run_download_type == "Archives (ZIP)" else "", + "skip_large_file": run_skip_large_file if run_download_type == "Archives (ZIP)" else False, + "large_file_mb": run_large_file_mb if run_download_type == "Archives (ZIP)" else 50.0, + "keep_zip_files": run_keep_zip_files if run_download_type == "Archives (ZIP)" else False, + "run_eval": run_eval, + "generate_parquet": generate_parquet, + "eval_recursive": eval_recursive, + "eval_overwrite": False, + "eval_workers": _default_eval_workers(), + } + task_id = _enqueue_task("download_and_eval", params) + if not task_id: + st.error("Failed to enqueue task. Check REDIS_URL and DATABASE_URL.") + return + + st.session_state["recent_eval_jobs_flash"] = ( + f"Queued Download + Eval + Parquet for `{detail.get('title', job_id)}`. " + f"Task id: `{task_id}`." + ) + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + + +def _render_recent_evaluator_jobs_section( + project_id: str, + environment: str, + *, + output_path_default: str, + download_type_default: str, + phase_default: str, + skip_large_file_default: bool, + large_file_mb_default: float, + keep_zip_files_default: bool, +) -> None: + """Render a direct evaluator-jobs browser above the download tabs.""" + _inject_recent_evaluator_jobs_styles() + show_section = st.toggle( + "Show recent evaluator jobs", + value=st.session_state.get("recent_eval_jobs_show", False), + key="recent_eval_jobs_show", + help="Load recent evaluator jobs only when you want to browse them.", + ) + if not show_section: + return + + st.subheader("Recent evaluator jobs") + st.caption("Compact browser for recent evaluator jobs. 
Select one job to inspect detailed suite and failed-case information.") + flash_message = st.session_state.pop("recent_eval_jobs_flash", None) + if flash_message: + st.success(flash_message) + user_directory = _get_recent_eval_user_directory() + + control_cols = st.columns([0.75, 1.0, 1.15, 1.45, 1.25, 1.0, 1.0, 0.75]) + with control_cols[0]: + st.markdown('
Rows
', unsafe_allow_html=True) + limit = int( + st.selectbox( + "Rows", + options=[6, 12, 20, 30], + index=1, + key="recent_eval_jobs_limit", + help="How many recent evaluator jobs to fetch for this project.", + label_visibility="collapsed", + ) + ) + with control_cols[1]: + st.markdown('
Status
', unsafe_allow_html=True) + status_filter = st.multiselect( + "Status", + options=["running", "success", "failed", "canceled", "unknown"], + default=[], + key="recent_eval_jobs_status_filter", + help="Leave empty to show all recent jobs.", + label_visibility="collapsed", + placeholder="All statuses", + ) + with control_cols[2]: + st.markdown('
Search In
', unsafe_allow_html=True) + search_scope = st.selectbox( + "Search in", + options=["Branch/tag", "Description", "Job ID", "Git SHA", "Fail message"], + index=0, + key="recent_eval_jobs_search_scope", + help="Choose which evaluator field the quick search should target.", + label_visibility="collapsed", + ) + with control_cols[3]: + st.markdown('
Search
', unsafe_allow_html=True) + search_text = st.text_input( + "Search", + value=st.session_state.get("recent_eval_jobs_search_text", ""), + key="recent_eval_jobs_search_text", + help="Server-side search across the selected field.", + label_visibility="collapsed", + placeholder="Type to search evaluator jobs", + ).strip() + recent_candidates = _get_recent_job_search_history(search_scope) + selected_user_name = "" + if recent_candidates: + recent_choice = st.selectbox( + "Recent searches", + options=[""] + recent_candidates, + index=0, + key=f"recent_eval_jobs_search_recent::{search_scope}", + help="Reuse a previously entered search for this field.", + ) + if recent_choice and recent_choice != search_text: + st.session_state["recent_eval_jobs_search_text"] = recent_choice + st.rerun() + user_candidates = sorted( + { + info.get("name", "").strip() + for info in user_directory.values() + if info.get("name", "").strip() + }, + key=str.lower, + ) + with control_cols[4]: + st.markdown('
User
', unsafe_allow_html=True) + selected_user_name = st.selectbox( + "User", + options=[""] + user_candidates, + index=0, + key="recent_eval_jobs_user_filter", + help="Filter jobs by resolved scheduled user name.", + label_visibility="collapsed", + ) + with control_cols[5]: + st.markdown('
From
', unsafe_allow_html=True) + date_from = st.date_input( + "From", + value=st.session_state.get("recent_eval_jobs_date_from", None), + key="recent_eval_jobs_date_from", + label_visibility="collapsed", + help="Scheduled-at lower bound in JST.", + ) + with control_cols[6]: + st.markdown('
To
', unsafe_allow_html=True) + date_to = st.date_input( + "To", + value=st.session_state.get("recent_eval_jobs_date_to", None), + key="recent_eval_jobs_date_to", + label_visibility="collapsed", + help="Scheduled-at upper bound in JST.", + ) + with control_cols[7]: + st.markdown('
Actions
', unsafe_allow_html=True) + if st.button("Refresh", key="refresh_recent_eval_jobs", use_container_width=True): + _fetch_recent_evaluator_job_pages.clear() + _fetch_evaluator_job_detail.clear() + st.rerun() + + page_key = "recent_eval_jobs_page" + if page_key not in st.session_state: + st.session_state[page_key] = 1 + if date_from and date_to and date_from > date_to: + st.warning("`From` date must be earlier than or equal to `To` date.") + return + + def _render_job_list() -> None: + nonlocal user_directory + if not project_id: + st.info("Enter a project id in the sidebar to browse recent evaluator jobs.") + return + current_page = max(1, int(st.session_state.get(page_key, 1))) + pages_to_fetch = max(3, current_page + 2) + if search_text or status_filter or date_from or date_to or selected_user_name: + pages_to_fetch = max(pages_to_fetch, 6) + server_status_values = tuple(_status_filter_values(status_filter)) + server_search_filter, search_needle = _build_recent_job_search_filter(search_text, search_scope, user_directory) + selected_user_ids = sorted( + { + subject_id + for subject_id, info in user_directory.items() + if selected_user_name + and selected_user_name.lower() == str(info.get("name") or "").strip().lower() + } + ) + server_date_filters = _build_recent_job_date_filters(date_from, date_to) + extra_filters: List[Dict[str, Any]] = [] + if server_search_filter: + extra_filters.append(server_search_filter) + if selected_user_ids: + extra_filters.append( + { + "field": "scheduled_by", + "operator": "In", + "values": selected_user_ids, + } + ) + extra_filters.extend(server_date_filters) + extra_filter_tuples = tuple( + ( + str(f["field"]), + str(f["operator"]), + tuple(f.get("values", []) or []), + ) + for f in extra_filters + ) + try: + fetched_pages = _fetch_recent_evaluator_job_pages( + project_id, + environment, + limit, + pages_to_fetch, + status_values=server_status_values, + extra_filters=extra_filter_tuples, + ) + except Exception as e: + 
st.error(f"Could not fetch recent evaluator jobs: {e}") + return + if search_text: + _save_recent_job_search_history(search_scope, search_text) + + jobs = [job for page in fetched_pages for job in page.get("jobs", [])] + user_directory = _hydrate_recent_eval_user_directory(jobs, environment) + has_more_from_api = bool(fetched_pages and fetched_pages[-1].get("next_token")) + + if search_needle: + if search_scope == "Branch/tag": + jobs = [job for job in jobs if search_needle in str(job.get("target", "")).lower()] + elif search_scope == "Description": + jobs = [job for job in jobs if search_needle in str(job.get("description", "")).lower() or search_needle in str(job.get("title", "")).lower()] + elif search_scope == "Job ID": + jobs = [job for job in jobs if search_needle in str(job.get("job_id", "")).lower()] + elif search_scope == "Git SHA": + jobs = [job for job in jobs if search_needle in str(job.get("git_sha", "")).lower()] + elif search_scope == "Fail message": + jobs = [job for job in jobs if search_needle in str(job.get("fail_message", "")).lower()] + if selected_user_name: + selected_lower = selected_user_name.lower() + jobs = [ + job for job in jobs + if selected_lower == str((user_directory.get(str(job.get("scheduled_by") or "").strip(), {}) or {}).get("name", "")).strip().lower() + ] + if status_filter: + selected = {evaluator_api.normalize_job_status(v) for v in status_filter} + jobs = [job for job in jobs if job.get("status_variant") in selected or evaluator_api.normalize_job_status(job.get("status", "")) in selected] + + if not jobs: + st.session_state[page_key] = 1 + st.markdown('
No recent evaluator jobs matched the current filters.
', unsafe_allow_html=True) + return + + total_loaded = len(jobs) + has_next_page = total_loaded > current_page * limit or has_more_from_api + max_known_page = max(1, (total_loaded + limit - 1) // limit) + if current_page > max_known_page: + current_page = max_known_page + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * limit + end_idx = start_idx + limit + visible_jobs = jobs[start_idx:end_idx] + if not visible_jobs and current_page > 1: + current_page = max(1, current_page - 1) + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * limit + end_idx = start_idx + limit + visible_jobs = jobs[start_idx:end_idx] + has_next_page = total_loaded > current_page * limit + + if current_page == 1: + page_numbers = list(range(1, min(3, max_known_page) + 1)) + else: + page_numbers = list( + range( + max(1, current_page - 1), + min(max_known_page, current_page + 1) + 1, + ) + ) + pager_cols = st.columns([0.8, 0.9, 0.9, 0.9, 0.8, 5.7]) + with pager_cols[0]: + if st.button("‹", key="recent_eval_jobs_prev", use_container_width=True, disabled=current_page <= 1): + st.session_state[page_key] = max(1, current_page - 1) + st.rerun() + for idx, page_num in enumerate(page_numbers[:3], start=1): + with pager_cols[idx]: + btn_key = ( + f"recent_eval_jobs_pagebtn_active_{page_num}" + if page_num == current_page + else f"recent_eval_jobs_pagebtn_{page_num}" + ) + if st.button( + str(page_num), + key=btn_key, + use_container_width=True, + disabled=page_num == current_page, + ): + st.session_state[page_key] = page_num + st.rerun() + with pager_cols[4]: + if st.button("›", key="recent_eval_jobs_next", use_container_width=True, disabled=not has_next_page): + st.session_state[page_key] = current_page + 1 + st.rerun() + + selected_job_id = st.session_state.get("recent_eval_jobs_selected") + if selected_job_id and not any(str(job.get("job_id", "")) == str(selected_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_selected", None) 
+ selected_job_id = None + + selected_run_job_id = st.session_state.get("recent_eval_jobs_run_selected") + if selected_run_job_id and not any(str(job.get("job_id", "")) == str(selected_run_job_id) for job in jobs): + st.session_state.pop("recent_eval_jobs_run_selected", None) + selected_run_job_id = None + + st.markdown('
', unsafe_allow_html=True) + for job in visible_jobs: + subject_id = str(job.get("scheduled_by") or "").strip() + user_info = user_directory.get(subject_id, {}) + user_label = str(user_info.get("name") or subject_id or "Unknown").strip() + row_cols = st.columns([9.8, 2.0]) + with row_cols[0]: + _render_recent_evaluator_job_card(job, user_label=user_label) + with row_cols[1]: + action_cols = st.columns([1.0, 1.0], gap="small") + with action_cols[0]: + if st.button("Details", key=f"recent_eval_view_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() + with action_cols[1]: + if st.button("Run", key=f"recent_eval_run_{job['job_id']}", use_container_width=True): + st.session_state["recent_eval_jobs_run_selected"] = str(job["job_id"]) + _fetch_evaluator_job_detail.clear() + st.rerun() + st.markdown("
", unsafe_allow_html=True) + + selected_job_id = st.session_state.get("recent_eval_jobs_selected") + if selected_job_id: + selected_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_job_id)), None) + if selected_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Job details · {selected_job.get('title', '—')}", width="large") + def _recent_eval_job_dialog() -> None: + _render_recent_evaluator_job_detail(project_id, environment, selected_job) + if st.button("Close", key="recent_eval_jobs_close_detail", use_container_width=True): + st.session_state.pop("recent_eval_jobs_selected", None) + st.rerun() + + _recent_eval_job_dialog() + finally: + st.session_state.pop("recent_eval_jobs_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Job details · {selected_job.get('title', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_detail_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_selected", None) + st.rerun() + _render_recent_evaluator_job_detail(project_id, environment, selected_job) + st.markdown("
", unsafe_allow_html=True) + + selected_run_job_id = st.session_state.get("recent_eval_jobs_run_selected") + if selected_run_job_id: + selected_run_job = next((job for job in jobs if str(job.get("job_id", "")) == str(selected_run_job_id)), None) + if selected_run_job: + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Download + Eval + Parquet · {selected_run_job.get('title', '—')}", width="large") + def _recent_eval_run_dialog() -> None: + _render_recent_evaluator_job_run_dialog( + project_id, + environment, + selected_run_job, + output_path_default=output_path_default, + download_type_default=download_type_default, + phase_default=phase_default, + skip_large_file_default=skip_large_file_default, + large_file_mb_default=large_file_mb_default, + keep_zip_files_default=keep_zip_files_default, + ) + + _recent_eval_run_dialog() + finally: + if st.session_state.get("recent_eval_jobs_run_selected") == str(selected_run_job_id): + st.session_state.pop("recent_eval_jobs_run_selected", None) + else: + st.markdown('
', unsafe_allow_html=True) + hdr_cols = st.columns([4.4, 1.1]) + with hdr_cols[0]: + st.subheader(f"Download + Eval + Parquet · {selected_run_job.get('title', '—')}") + with hdr_cols[1]: + if st.button("Close", key="recent_eval_jobs_close_run_fallback", use_container_width=True): + st.session_state.pop("recent_eval_jobs_run_selected", None) + st.rerun() + _render_recent_evaluator_job_run_dialog( + project_id, + environment, + selected_run_job, + output_path_default=output_path_default, + download_type_default=download_type_default, + phase_default=phase_default, + skip_large_file_default=skip_large_file_default, + large_file_mb_default=large_file_mb_default, + keep_zip_files_default=keep_zip_files_default, + ) + st.markdown("
", unsafe_allow_html=True) + + _render_job_list() + + # Sidebar for configuration with st.sidebar: st.header("Configuration") @@ -1330,7 +2690,6 @@ def on_suite_id_change(): skip_large_file = False large_file_mb = 50.0 # Doesn't apply - st.markdown('

Pick a workflow

', unsafe_allow_html=True) tab1, tab2, tab3, tab4 = st.tabs( ["📥 Download Results", "🗺️ Download Scenarios", "📊 View Downloads", "🧮 Eval Results"] @@ -1518,6 +2877,188 @@ def on_suite_id_change(): st.error(f"❌ Error: {str(e)}") st.exception(e) + # === Combined Download + Eval + Parquet Button === + st.divider() + st.subheader("🚀 Combined Workflow: Download + Eval + Parquet") + st.caption("Download results, run evaluation, and generate parquet in one click. Eval only runs if download succeeds.") + + # Options for combined workflow + col_combo1, col_combo2 = st.columns(2) + with col_combo1: + combined_run_eval = st.checkbox( + "Run evaluation (eval_result + Summary/Score CSV)", + value=True, + key="combined_run_eval", + help="Run eval_result on downloaded directories and generate Summary.csv/Score.csv" + ) + with col_combo2: + combined_generate_parquet = st.checkbox( + "Generate parquet", + value=CATALOG_IO_AVAILABLE, + key="combined_generate_parquet", + help="Build scene_result.parquet from .pkl files" if CATALOG_IO_AVAILABLE else "Install perception_catalog_analyzer to enable", + disabled=not CATALOG_IO_AVAILABLE, + ) + + combined_eval_recursive = st.checkbox( + "Search subdirectories for eval", + value=True, + key="combined_eval_recursive", + help="Recursively search for result directories" + ) + + if st.button("📥 Download + Eval + Parquet", type="primary", key="download_and_eval_btn"): + st.session_state.stop_downloads = False + if not all([project_id, st.session_state.job_id]): + st.error("Please fill in all required fields: Project ID and Job ID") + st.stop() + resolved_output, path_err = resolve_under_data_root(output_path, allow_create=True) + if path_err: + st.error(f"Output path is invalid: {path_err}. 
Use a path under the server data root.") + st.stop() + resolved_path_str = str(resolved_output) + set_config_value("output_path", to_data_relative(resolved_output)) + set_config_value("environment", environment) + set_config_value("project_id", project_id) + set_config_value("job_id", st.session_state.job_id) + set_config_value("suite_id", suite_id) + set_config_value("suite_ids", selected_suite_ids) + set_config_value("download_type", download_type) + if download_type == "Archives (ZIP)": + set_config_value("phase", phase) + set_config_value("skip_large_file", skip_large_file) + set_config_value("large_file_mb", large_file_mb) + set_config_value("keep_zip_files", keep_zip_files) + + if is_task_queue_enabled(): + # Enqueue combined task + params = { + "output_path": resolved_path_str, + "project_id": project_id, + "job_id": st.session_state.job_id, + "suite_id": suite_id or "", + "suite_ids": selected_suite_ids or None, + "download_type": "archives" if download_type == "Archives (ZIP)" else "result_json", + "phase": phase if download_type == "Archives (ZIP)" else "", + "skip_large_file": skip_large_file, + "large_file_mb": large_file_mb, + "keep_zip_files": keep_zip_files, + "run_eval": combined_run_eval, + "generate_parquet": combined_generate_parquet, + "eval_recursive": combined_eval_recursive, + "eval_overwrite": False, + "eval_workers": _default_eval_workers(), + } + task_id = _enqueue_task("download_and_eval", params) + if task_id: + st.success("Combined task queued. It will appear in the **Task status** section below; the list updates automatically.") + st.info("The task will: 1) Download results → 2) Run eval (if download succeeds) → 3) Generate parquet (if download succeeds)") + else: + st.error("Failed to enqueue task. 
Check REDIS_URL and DATABASE_URL.") + st.stop() + + # Inline execution (non-task-queue mode) + os.makedirs(resolved_path_str, exist_ok=True) + try: + job_result = JobResult( + environment=environment, + project_id=project_id, + job_id=st.session_state.job_id, + suite_id=suite_id, + suite_ids=selected_suite_ids, + output_path=resolved_path_str, + ) + + # Progress containers + progress_placeholder = st.empty() + status_placeholder = st.empty() + + def inline_progress(msg: str): + status_placeholder.info(msg) + + # Step 1: Download + progress_placeholder.info("📥 Step 1/3: Downloading results...") + download_successful = False + if download_type == "Archives (ZIP)": + with st.expander("Downloading Archives", expanded=True): + remain_list = job_result.download_archive_and_unzip( + phase, + skip_large_file=skip_large_file, + large_file_mb=large_file_mb, + keep_zip_files=keep_zip_files, + ) + download_successful = len(remain_list) > 0 + st.success(f"✅ Downloaded and extracted {len(remain_list)} archives") + else: + with st.expander("Downloading Result JSON", expanded=True): + log_dicts = job_result.download_result_json() + download_successful = len(log_dicts) > 0 + st.success(f"✅ Downloaded {len(log_dicts)} JSON files") + + if not download_successful: + st.error("❌ Download failed. 
Cannot continue with evaluation.") + st.stop() + + # Step 2: Run eval + if combined_run_eval: + progress_placeholder.info("🧮 Step 2/3: Running evaluation...") + target_dirs = find_eval_result_dirs(resolved_path_str, recursive=combined_eval_recursive) + if target_dirs: + eval_results = [] + eval_progress = st.progress(0) + for i, result_dir in enumerate(target_dirs): + eval_progress.progress((i + 1) / len(target_dirs), f"Evaluating {i+1}/{len(target_dirs)}") + eval_results.append(run_eval_result_for_dir(result_dir, overwrite=False)) + eval_progress.empty() + + success_eval = sum(1 for r in eval_results if r["status"] == "success") + failed_eval = sum(1 for r in eval_results if r["status"] == "failed") + st.success(f"✅ Eval complete: {success_eval} success, {failed_eval} failed") + + # Generate summary CSVs + with st.spinner("Generating Summary.csv and Score.csv..."): + csv_info = generate_summary_and_score_csv(resolved_path_str) + st.success(f"Generated Summary.csv ({csv_info['summary_rows']} rows) and Score.csv ({csv_info['score_rows']} rows)") + else: + st.warning("⚠️ No eval result directories found") + + # Step 3: Generate parquet + if combined_generate_parquet and CATALOG_IO_AVAILABLE: + progress_placeholder.info("📦 Step 3/3: Generating parquet...") + pkl_dir = Path(resolved_path_str) + all_pkl_files = list(pkl_dir.rglob("*.pkl")) + list(pkl_dir.rglob("*.pkl.z")) + pkl_count = len(all_pkl_files) + if pkl_count > 0: + with st.spinner(f"Processing {pkl_count} pkl files..."): + parquet_path = pkl_archive_to_parquet( + pkl_dir, + on_progress=None, + on_skip=None, + project_id=project_id, + job_id=st.session_state.job_id, + ) + st.success(f"✅ Parquet generated: {parquet_path}") + else: + st.warning("⚠️ No .pkl files found for parquet generation") + + progress_placeholder.empty() + status_placeholder.empty() + st.success("🎉 Combined workflow complete!") + + # Show file tree + with st.expander("📁 File Structure"): + for root, dirs, files in os.walk(resolved_path_str): 
+ level = root.replace(resolved_path_str, '').count(os.sep) + indent = ' ' * 4 * level + st.text(f"{indent}{os.path.basename(root)}/") + subindent = ' ' * 4 * (level + 1) + for file in files: + st.text(f"{subindent}{file}") + + except Exception as e: + st.error(f"❌ Error: {str(e)}") + st.exception(e) + # Information section with st.expander("ℹ️ How to use"): st.markdown(""" @@ -1753,7 +3294,7 @@ def on_suite_id_change(): st.error(f"Failed to save: {e}") st.exception(e) - col1, col2, col3 = st.columns(3) + col1, col2 = st.columns(2) with col1: eval_recursive = st.checkbox( "Search subdirectories", @@ -1766,28 +3307,10 @@ def on_suite_id_change(): value=get_config_value("eval_overwrite", False), help="If unchecked, directories with result.txt will be skipped", ) - with col3: - eval_parallel = st.checkbox( - "Run in parallel", - value=get_config_value("eval_parallel", False), - help="Temporarily disabled. Parallel execution currently provides no measurable benefit.", - disabled=True - ) - if eval_parallel: - eval_workers = st.number_input( - "Eval worker threads", - min_value=1, - max_value=16, - value=get_config_value("eval_workers", 1), - help="Number of parallel threads used to run eval_result", - ) - set_config_value("eval_workers", eval_workers) - else: - eval_workers = 1 - set_config_value("eval_workers", eval_workers) + eval_workers = _default_eval_workers() + set_config_value("eval_workers", eval_workers) set_config_value("eval_recursive", eval_recursive) set_config_value("eval_overwrite", eval_overwrite) - set_config_value("eval_parallel", eval_parallel) # New option: Only generate summary/score csv only_generate_summary = st.checkbox( @@ -1885,6 +3408,7 @@ def _emit_eval_finished_notification(message: str): "eval_root": eval_path, "recursive": eval_recursive, "overwrite": eval_overwrite, + "eval_workers": eval_workers, }) if tid: enqueued.append(f"{'generate_summary_csv' if only_generate_summary else 'run_eval_dirs'} ({tid[:8]}...)") @@ -2046,30 +3570,23 @@ 
def _update_progress_status(done: int, total_dirs: int): ) try: - # sequential evaluation - if not eval_parallel: - for i, result_dir in enumerate(target_dirs): - _update_progress_status(i, total) - results.append(run_eval_result_for_dir(result_dir, overwrite=eval_overwrite)) - _update_progress_status(i + 1, total) - else: - max_workers = max(1, min(int(eval_workers), len(target_dirs))) - with ThreadPoolExecutor(max_workers=max_workers) as executor: - future_map = { - executor.submit(_run_eval_result_worker, result_dir, eval_overwrite): result_dir - for result_dir in target_dirs - } - completed = 0 - for future in as_completed(future_map): - completed += 1 - _update_progress_status(completed, total) - try: - results.append(future.result()) - except Exception as e: - result_dir = future_map.get(future, "unknown") - results.append( - {"path": result_dir, "status": "failed", "detail": str(e)} - ) + max_workers = max(1, min(int(eval_workers), len(target_dirs))) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_map = { + executor.submit(_run_eval_result_worker, result_dir, eval_overwrite): result_dir + for result_dir in target_dirs + } + completed = 0 + for future in as_completed(future_map): + completed += 1 + _update_progress_status(completed, total) + try: + results.append(future.result()) + except Exception as e: + result_dir = future_map.get(future, "unknown") + results.append( + {"path": result_dir, "status": "failed", "detail": str(e)} + ) _update_progress_status(total, total) finally: @@ -2106,4 +3623,4 @@ def _update_progress_status(done: int, total_dirs: int): if notify_when_done: _emit_eval_finished_notification( f"Eval run finished with CSV error. Success: {success_count}, Skipped: {skipped_count}, Failed: {failed_count}. 
{e}" - ) \ No newline at end of file + ) diff --git a/evaluation_dashboard_app/pages/6_Workflow.py b/evaluation_dashboard_app/pages/6_Workflow.py new file mode 100644 index 0000000..33e9816 --- /dev/null +++ b/evaluation_dashboard_app/pages/6_Workflow.py @@ -0,0 +1,2548 @@ +""" +Evaluator Workflow page: +- browse finished local runs and launch compare views +- monitor server-side tasks +- start new evaluator pipelines +- run download/eval from existing evaluator jobs +""" + +from __future__ import annotations + +import html +import io +import json +import os +import re +import urllib.parse +import zipfile +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Dict, List, Optional + +import streamlit as st +import requests + +from lib.db import count_recent_tasks, create_task, is_task_queue_enabled, list_recent_tasks, update_task_rq_job_id +from lib.page_chrome import ( + inject_app_page_styles, + render_page_hero, + section_header, +) +from lib.path_utils import ( + delete_run, + format_size, + get_data_root_display, + get_run_info, + list_run_directories, + resolve_run_subdirectory, + resolve_under_data_root, +) +from lib.run_metadata import ( + build_run_search_blob, + read_run_metadata, + upsert_run_metadata, +) +from lib.specsheet_report import ( + DEFAULT_TREND_TOPIC, + DETECTION_TREND_TOPIC_BY_MODEL, + parse_trend_metadata_text, +) +from lib.ui.recent_evaluator_jobs import ( + _fetch_evaluator_job_detail, + _format_source_ref_html, + _format_source_ref_text, + _render_recent_evaluator_job_retest_dialog, + _render_recent_evaluator_jobs_section, + configure_recent_evaluator_jobs_ui, +) +from lib.ui.task_history import get_task_list_current_user, render_task_list +from lib.ui.styles_download import inject_download_page_styles +from lib.user_config import UserConfig + +try: + from lib.perception_catalog_io import pkl_archive_to_parquet + + CATALOG_IO_AVAILABLE = True +except ImportError: + CATALOG_IO_AVAILABLE = False + 
# Fixed UTC+9 offset used when rendering timestamps on this page.
_JST = timezone(timedelta(hours=9))
_TASK_LIST_MAX_ROWS = 200
_TASK_LIST_SINCE_DAYS = 7
# NOTE(review): the IDs below look like server-side catalog / integration
# identifiers for the release workflow; their consumers are outside this span.
_RELEASE_PERFORMANCE_CATALOG_ID = "e36d75b9-6c3a-4970-9b9b-5cd13f7a9da3"
_RELEASE_PERFORMANCE_INTEGRATION_ID = "96ad8fba-0228-4c2b-9166-07d4de1a0760"
_RELEASE_DEVOPS_CATALOG_ID = "ab0f8498-cc1b-4726-836f-e18e8bcb3200"
_RELEASE_DEVOPS_INTEGRATION_ID = "295cff78-9bc9-4d60-b7aa-f95be6ff96a4"
_RELEASE_OPTIONAL_CATALOG_ID = "09039022-ec91-41bf-9e93-fdefccdfc9bc"
_RELEASE_SKIP_LARGE_FILE = True
_RELEASE_LARGE_FILE_MB = 50.0
# UI label -> trend topic name; "Custom" maps to an empty topic.
_RELEASE_TREND_TOPIC_OPTIONS = {
    "Prediction / object recognition": DEFAULT_TREND_TOPIC,
    "ML model / CenterPoint": DETECTION_TREND_TOPIC_BY_MODEL["centerpoint"],
    "ML model / BEVFusion": DETECTION_TREND_TOPIC_BY_MODEL["bevfusion"],
    "Custom": "",
}
# UI label -> look-back window in days; ``None`` means no limit.
_TASK_HISTORY_RANGE_OPTIONS = {
    "7 days": 7,
    "30 days": 30,
    "90 days": 90,
    "All": None,
}


st.set_page_config(
    page_title="Evaluator Workflow",
    layout="wide",
    initial_sidebar_state="collapsed",
)
inject_app_page_styles()
inject_download_page_styles()


_user_config = UserConfig(warning_fn=st.warning)


def get_config_value(key: str, default=None):
    """Return the stored user-config value for ``key``, or ``default``."""
    return _user_config.get(key, default)


def set_config_value(key: str, value) -> None:
    """Store ``value`` under ``key`` in the user config."""
    _user_config.set(key, value)


def _to_jst(dt):
    """Convert ``dt`` to JST, treating naive datetimes as UTC.

    Returns ``None`` when ``dt`` is ``None`` or the conversion fails.
    """
    if dt is None:
        return None
    try:
        if getattr(dt, "tzinfo", None) is None:
            dt = dt.replace(tzinfo=timezone.utc)
        return dt.astimezone(_JST)
    except Exception:
        return None


def _load_catalog_presets():
    """Load catalog presets from the first readable ``catalogs.json``.

    Candidate locations are tried in order: the app root (parent of this
    page's directory), the ``CATALOGS_PATH`` environment variable, then the
    current working directory. The file may hold either a list of catalog
    dicts or a dict with a ``"catalogs"`` list.

    Returns:
        ``(presets, loaded_path, load_error)`` where ``presets`` is a list of
        catalog dicts each carrying a ``display_name``, ``loaded_path`` is the
        path that was read (or ``None``), and ``load_error`` is the message of
        the last failed attempt when no file could be loaded (or ``None``).
    """
    app_root = Path(__file__).parent.parent
    catalogs_filename = "catalogs.json"
    search_paths = [
        app_root / catalogs_filename,
        # An unset variable yields Path("") == ".", which is not a file and is skipped.
        Path(os.environ.get("CATALOGS_PATH", "")),
        Path.cwd() / catalogs_filename,
    ]
    catalogs = []
    loaded_path = None
    load_error = None
    for path in search_paths:
        if not path.is_file():
            continue
        try:
            with open(path, "r", encoding="utf-8") as handle:
                data = json.load(handle)
        except Exception as exc:
            # Best effort: remember the failure and try the next candidate.
            load_error = str(exc)
            continue
        raw = data.get("catalogs") if isinstance(data, dict) else data
        # Anything other than a list (e.g. ``"catalogs": null``) means "no
        # presets"; iterating it below used to raise TypeError.
        catalogs = raw if isinstance(raw, list) else []
        loaded_path = str(path)
        load_error = None
        break

    presets = []
    for item in catalogs:
        if not isinstance(item, dict):
            continue
        display_name = item.get("display_name") or item.get("name") or item.get("catalog_id", "Unknown")
        presets.append({**item, "display_name": display_name})
    return presets, loaded_path, load_error
def _fetch_server_catalogs(project_id: str, environment: str) -> List[Dict[str, str]]:
    """Fetch available catalogs for the project on demand.

    Returns an empty list when ``project_id`` is empty. Otherwise sets the
    ``AUTH_PROFILE`` environment variable (``"default"`` when ``environment``
    is empty), calls ``catalogAPI.list_catalogs()`` and returns one dict per
    usable entry with ``catalog_id``, ``display_name`` and ``description``,
    ordered case-insensitively by display name. HTTP failures propagate from
    ``raise_for_status()``.
    """
    if not project_id:
        return []
    from lib.WebAPI import catalogAPI

    os.environ["AUTH_PROFILE"] = environment or "default"
    reply = catalogAPI(project_id=project_id).list_catalogs()
    reply.raise_for_status()
    body = reply.json()
    entries = body.get("catalogs", []) if isinstance(body, dict) else body

    rows: List[Dict[str, str]] = []
    for entry in entries or []:
        if not isinstance(entry, dict):
            continue
        ident = str(entry.get("id") or entry.get("catalog_id") or "").strip()
        label = str(entry.get("display_name") or entry.get("name") or ident).strip()
        if ident and label:
            rows.append(
                {
                    "catalog_id": ident,
                    "display_name": label,
                    "description": str(entry.get("description") or "").strip(),
                }
            )
    return sorted(rows, key=lambda row: row["display_name"].lower())


def _resolve_integration_id_for_catalog(project_id: str, environment: str, catalog_id: str) -> str:
    """Resolve the most relevant active integration for a catalog.

    Returns ``""`` when ``project_id`` or ``catalog_id`` is empty. Otherwise
    sets ``AUTH_PROFILE``, requests the project's integrations filtered by
    catalog, keeps the non-deleted entries whose ``catalog_id`` matches, and
    returns the ``id`` of the entry ranking highest by
    ``(updated_at, version_id, id)``.

    Raises:
        RuntimeError: no response, a non-200 status, or no active integration.
    """
    if not (project_id and catalog_id):
        return ""
    from lib import evaluator_api

    os.environ["AUTH_PROFILE"] = environment or "default"
    api = evaluator_api.EvaluationRunAPI()
    endpoint = f"{api.api_base_url}/projects/{project_id}/integrations"
    response = api.request(endpoint, {"catalog_id": catalog_id, "size": 100}, method="GET")
    if response is None:
        raise RuntimeError("No response returned while loading integrations.")
    if response.status_code != 200:
        raise RuntimeError(f"Failed to load integrations: status={response.status_code}")

    payload = json.loads(response.content)
    candidates = []
    for entry in payload.get("integrations", []) or []:
        if not isinstance(entry, dict):
            continue
        if str(entry.get("catalog_id") or "").strip() != catalog_id:
            continue
        if bool(entry.get("deleted")):
            continue
        candidates.append(entry)
    if not candidates:
        raise RuntimeError("No active integration was found for the selected catalog.")

    # max() returns the first maximal element, the same one a stable
    # descending sort would place at index 0.
    newest = max(
        candidates,
        key=lambda entry: (
            str(entry.get("updated_at") or ""),
            int(entry.get("version_id") or 0),
            str(entry.get("id") or ""),
        ),
    )
    return str(newest.get("id") or "").strip()
def _enqueue_task(task_type: str, params: dict) -> Optional[str]:
    """Create a task record and enqueue it on the RQ queue.

    The task row is created first via ``create_task``; the job is then pushed
    to the queue named by ``RQ_QUEUE`` (default ``"default"``) on the Redis
    instance at ``REDIS_URL`` (default ``redis://localhost:6379``), and the RQ
    job id is written back to the task row when one is available.

    Returns:
        The task id, or ``None`` when the task row could not be created or any
        step raised. Exceptions are reported through ``st.error`` rather than
        propagated.
    """
    try:
        session_id = get_task_list_current_user()
        task_id = create_task(task_type, params, session_id=session_id)
        if not task_id:
            return None

        # Imported lazily so the page can load without the queue stack.
        from redis import Redis
        from rq import Queue
        from worker.tasks import run_job

        redis_url = os.environ.get("REDIS_URL", "redis://localhost:6379")
        redis_conn = Redis.from_url(redis_url)
        queue = Queue(
            name=os.environ.get("RQ_QUEUE", "default"),
            connection=redis_conn,
            default_timeout="7d",
        )
        job = queue.enqueue(
            run_job,
            task_id,
            task_type,
            params,
            job_timeout="7d",
            result_ttl="7d",
        )
        rq_id = getattr(job, "id", None)
        if rq_id:
            update_task_rq_job_id(task_id, str(rq_id))
        return task_id
    except Exception as exc:
        st.error(f"Failed to enqueue task: {exc}")
        return None


def _make_default_output_path(branch_name: str) -> str:
    """Build a default output directory name ``eval_<branch>_<YYYYmmdd_HHMMSS>``.

    Leading/trailing slashes are dropped from ``branch_name``, every non-word
    character becomes ``_``, and runs of ``_`` are collapsed. When the name is
    empty, or nothing is left after sanitising (for example ``"///"``), the
    branch segment falls back to ``"eval"`` so the result never contains an
    empty segment. The timestamp uses the process's local time.
    """
    sanitized = re.sub(r"[^\w]", "_", branch_name.strip("/")) if branch_name else "eval"
    # ``or "eval"``: names made only of separators collapse to "" here.
    sanitized = re.sub(r"_+", "_", sanitized).strip("_") or "eval"
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"eval_{sanitized}_{ts}"
def _safe_output_part(value: object, fallback: str) -> str:
    """Return ``value`` reduced to word characters, dots and dashes.

    Every other run of characters becomes a single ``_``; leading/trailing
    ``.`` and ``_`` are removed. ``fallback`` is returned when nothing is left.
    """
    raw = str(value or "").strip()
    cleaned = re.sub(r"[^\w.\-]+", "_", raw).strip("._")
    return cleaned if cleaned else fallback


def _catalog_preset_emoji(preset_name: str, *, has_custom_catalog: bool = False) -> str:
    """Pick the emoji for a catalog preset name.

    Known preset names map to a dedicated emoji; unknown names yield the
    custom-catalog emoji when ``has_custom_catalog`` is set, else a generic one.
    """
    known = {
        "Build Test Catalog": "🛠️",
        "Performance Test": "📈",
        "Old performance test": "🕰️",
        "Devops Test": "⚙️",
        "Usecase Performance Catalog": "🧭",
        "L4 regression test": "⚠️",
    }
    emoji = known.get(str(preset_name or "").strip())
    if emoji is not None:
        return emoji
    return "🧩" if has_custom_catalog else "📦"


def _make_auto_workflow_description(
    target_name: str,
    preset_name: str = "",
    *,
    has_custom_catalog: bool = False,
) -> str:
    """Compose the default description for an evaluator workflow run.

    The target name has its whitespace collapsed (``"default"`` when empty);
    it is followed by a local-time ``MM-DD HH:MM`` stamp and the preset emoji.
    """
    target = re.sub(r"\s+", " ", str(target_name or "").strip() or "default")
    stamp = datetime.now().strftime("%m-%d %H:%M")
    emoji = _catalog_preset_emoji(preset_name, has_custom_catalog=has_custom_catalog)
    return f"🚀 evaluator workflow [{target}] [{stamp}] {emoji}"


def _make_auto_release_workflow_description(target_name: str) -> str:
    """Compose the default description for a release workflow run."""
    target = re.sub(r"\s+", " ", str(target_name or "").strip() or "default")
    stamp = datetime.now().strftime("%m-%d %H:%M")
    return f"🚀 release workflow [{target}] [{stamp}]"


def _make_default_release_pilot_auto_version(target_name: str) -> str:
    """Derive a ``Pilot.Auto`` version label from a target name.

    Uses the first ``X.Y.Z`` number found in the name; otherwise the name
    itself; otherwise the literal ``"Pilot.Auto release"``.
    """
    label = str(target_name or "").strip()
    found = re.search(r"v?(\d+\.\d+\.\d+)", label)
    if found is not None:
        return f"Pilot.Auto v{found.group(1)}"
    if label:
        return f"Pilot.Auto {label}"
    return "Pilot.Auto release"


def _make_default_release_metadata_text(target_name: str) -> str:
    """Build the default ``key: value`` metadata text for a release run.

    The date is today's date in JST; the topic defaults to
    ``DEFAULT_TREND_TOPIC``; ``version_abbr`` is the sanitised version label
    without the ``Pilot.Auto`` prefix, cut to 16 characters.
    """
    version_label = _make_default_release_pilot_auto_version(target_name)
    abbr_source = version_label.replace('Pilot.Auto', '').strip()
    version_abbr = _safe_output_part(abbr_source, 'release')[:16]
    if target_name:
        description = f"{target_name} release data update"
    else:
        description = "Release data update"
    today = datetime.now(_JST).strftime("%Y.%m.%d")
    lines = [
        "tags: [trend]\n",
        f"release_group: {_safe_output_part(target_name, 'release')}\n",
        f'pilot_auto_version: "{version_label}"\n',
        f"version_abbr: {version_abbr}\n",
        "data_count: 99,776+\n",
        f"description: {description}\n",
        f"date: {today}\n",
        f"topic_name: {DEFAULT_TREND_TOPIC}\n",
    ]
    return "".join(lines)
def _extract_release_metadata_topic(text: str) -> str:
    """Return the ``topic_name`` declared in release metadata text.

    The text is parsed with ``parse_trend_metadata_text``; if that raises, a
    line-based regex is used instead. ``DEFAULT_TREND_TOPIC`` is returned when
    no topic is declared.
    """
    try:
        metadata = parse_trend_metadata_text(text)
        return str(metadata.get("topic_name") or DEFAULT_TREND_TOPIC).strip()
    except Exception:
        # Best effort on text the parser rejects (e.g. while the user is typing).
        match = re.search(r"(?m)^topic_name\s*:\s*['\"]?([^'\"\n#]+)", text or "")
        return match.group(1).strip() if match else DEFAULT_TREND_TOPIC


def _replace_release_metadata_topic(text: str, topic: str) -> str:
    """Set the ``topic_name`` line of the metadata text to ``topic``.

    An empty/blank ``topic`` leaves ``text`` untouched. Existing
    ``topic_name:`` lines are rewritten in place; when there is none, a new
    line is appended (after trimming trailing whitespace from ``text``).
    """
    topic = str(topic or "").strip()
    if not topic:
        return text
    line = f"topic_name: {topic}"
    if re.search(r"(?m)^topic_name\s*:", text or ""):
        # Callable replacement: the topic is user input and must be inserted
        # literally, not interpreted as an ``re.sub`` template (a backslash or
        # ``\1`` in it would otherwise raise re.error or corrupt the text).
        return re.sub(r"(?m)^topic_name\s*:.*$", lambda _match: line, text)
    return (text.rstrip() + "\n" + line + "\n") if text else line + "\n"


def _format_run_mtime(mtime: float) -> str:
    """Format a POSIX timestamp as ``YYYY-MM-DD HH:MM JST``; ``"—"`` if unset or invalid."""
    if not mtime:
        return "—"
    try:
        return datetime.fromtimestamp(mtime, tz=_JST).strftime("%Y-%m-%d %H:%M JST")
    except Exception:
        return "—"


def _build_overview_url(run_a: str, compare_runs: Optional[List[str]] = None) -> str:
    """Build the overview page URL for one run, optionally compared with others.

    Blank compare names are ignored. With at least one compare run the mode
    becomes ``compare`` and up to four runs are added as ``run_b`` … ``run_e``.
    """
    query = {"mode": "single", "run_a": run_a}
    valid_compare_runs = [str(name).strip() for name in (compare_runs or []) if str(name).strip()]
    if valid_compare_runs:
        query["mode"] = "compare"
        for idx, run_name in enumerate(valid_compare_runs[:4]):
            # chr(98) == "b": run_b, run_c, run_d, run_e.
            query[f"run_{chr(98 + idx)}"] = run_name
    return f"/?{urllib.parse.urlencode(query)}"


def _format_metadata_time(value: object) -> str:
    """Format a metadata timestamp as ``YYYY-MM-DD HH:MM JST``.

    Accepts a ``datetime`` or an ISO-8601 string (a trailing ``Z`` is treated
    as UTC); naive values are assumed to be UTC. Returns ``"—"`` for empty
    values and ``str(value)`` when the value cannot be parsed or converted.
    """
    if not value:
        return "—"
    if isinstance(value, datetime):
        dt = value
    else:
        try:
            dt = datetime.fromisoformat(str(value).replace("Z", "+00:00"))
        except Exception:
            return str(value)
    if getattr(dt, "tzinfo", None) is None:
        dt = dt.replace(tzinfo=timezone.utc)
    try:
        return dt.astimezone(_JST).strftime("%Y-%m-%d %H:%M JST")
    except Exception:
        return str(value)
def _metadata_text(value: object) -> str:
    """Return ``value`` as stripped text, or ``"—"`` when it is empty."""
    return str(value or "").strip() or "—"


def _run_user_label(subject_id: str, environment: str) -> str:
    """Return a display label for the user who requested a run.

    Empty ids yield ``"(Auto)"``; ids without the ``t4:`` prefix are shown as
    is; ``t4:`` ids are looked up with ``_resolve_subject_name`` and fall back
    to ``"(Auto)"`` when the lookup raises.
    """
    subject = str(subject_id or "").strip()
    if not subject:
        return "(Auto)"
    if not subject.startswith("t4:"):
        return subject
    try:
        profile = _resolve_subject_name(subject, environment or "default")
        resolved = str(profile.get("name") or subject).strip()
    except Exception:
        return "(Auto)"
    return resolved if resolved else subject


def _catalog_url(project_id: str, catalog_id: str, metadata_url: str = "") -> str:
    """Return the catalog page URL, preferring an explicit ``metadata_url``.

    Without one, the URL is built from project and catalog ids; ``""`` is
    returned when either id is missing.
    """
    explicit = str(metadata_url or "").strip()
    if explicit:
        return explicit
    project = str(project_id or "").strip()
    catalog = str(catalog_id or "").strip()
    if not (project and catalog):
        return ""
    return f"https://evaluation.tier4.jp/evaluation/vehicle_catalogs/{catalog}?project_id={project}"


@st.cache_data(ttl=24 * 3600, show_spinner=False)
def _catalog_preset_name_map() -> Dict[str, str]:
    """Map preset catalog ids to their display names (cached for 24 hours)."""
    presets, _, _ = _load_catalog_presets()
    names: Dict[str, str] = {}
    for preset in presets:
        if not isinstance(preset, dict):
            continue
        ident = str(preset.get("catalog_id") or "").strip()
        label = str(preset.get("display_name") or preset.get("name") or "").strip()
        if ident and label:
            names[ident] = label
    return names


def _catalog_label_for_run(catalog_id: str, catalog_name: str) -> str:
    """Return the label shown for a run's catalog.

    Preference order: the recorded catalog name, the preset display name for
    the id, the id itself; ``"—"`` when neither name nor id is available.
    """
    recorded = str(catalog_name or "").strip()
    if recorded:
        return recorded
    ident = str(catalog_id or "").strip()
    if not ident:
        return "—"
    return _catalog_preset_name_map().get(ident, "").strip() or ident


@st.cache_data(ttl=15, show_spinner=False)
def _load_local_runs() -> List[Dict[str, object]]:
    """Collect one display row per local run directory (cached for 15 seconds).

    Each row merges ``get_run_info`` with the run's metadata sections
    (``task``, ``request``, ``evaluator``). Rows are ordered newest first,
    then by lower-cased name.
    """

    def _section(meta, key: str) -> dict:
        # A metadata section counts only when it is a dict.
        return meta.get(key) if isinstance(meta.get(key), dict) else {}

    def _first_text(*candidates: object) -> str:
        # First truthy candidate as stripped text; "" when all are empty.
        return str(next((c for c in candidates if c), "")).strip()

    rows: List[Dict[str, object]] = []
    for run_path in list_run_directories():
        info = get_run_info(run_path)
        metadata = read_run_metadata(run_path)
        task_meta = _section(metadata, "task")
        request_meta = _section(metadata, "request")
        evaluator_meta = _section(metadata, "evaluator")

        description = _first_text(request_meta.get("description"), evaluator_meta.get("description"))
        requested_by = _first_text(evaluator_meta.get("scheduled_by"), task_meta.get("requested_by"))
        environment = str(request_meta.get("environment") or "default").strip() or "default"
        requested_by_label = _run_user_label(requested_by, environment)
        task_type = _first_text(task_meta.get("type"), metadata.get("source_mode"))
        task_status = str(task_meta.get("status") or "").strip()
        evaluator_job_id = _first_text(evaluator_meta.get("job_id"), request_meta.get("job_id"))
        evaluator_report_url = str(evaluator_meta.get("report_url") or "").strip()
        evaluator_title = _first_text(evaluator_meta.get("title"), description, evaluator_job_id)
        evaluator_target = _first_text(evaluator_meta.get("target"), request_meta.get("target_name"))
        catalog_id = _first_text(evaluator_meta.get("catalog_id"), request_meta.get("catalog_id"))
        catalog_name = str(evaluator_meta.get("catalog_name") or "").strip()
        catalog_label = _catalog_label_for_run(catalog_id, catalog_name)
        catalog_url = _catalog_url(
            str(request_meta.get("project_id") or "").strip(),
            catalog_id,
            str(evaluator_meta.get("catalog_url") or "").strip(),
        )
        case_totals = _section(evaluator_meta, "case_totals")
        passed_count = int(case_totals.get("success", 0) or 0)
        failed_count = int(case_totals.get("failed", 0) or 0)
        canceled_count = int(case_totals.get("canceled", 0) or 0)
        search_blob = build_run_search_blob(
            run_path,
            metadata,
            extra_values=[
                description,
                requested_by,
                requested_by_label,
                task_type,
                task_status,
                evaluator_job_id,
                catalog_id,
                catalog_name,
                evaluator_target,
            ],
        )

        row: Dict[str, object] = {
            "name": info["name"],
            "run_path": run_path,
            "path_display": f"{get_data_root_display()}/{info['name']}",
            "size": format_size(info["size_bytes"]),
            "mtime": float(info["mtime"] or 0),
            "mtime_date": _to_jst(datetime.fromtimestamp(float(info["mtime"] or 0), tz=timezone.utc)).date() if info["mtime"] else None,
            "modified": _format_run_mtime(info["mtime"]),
        }
        for flag in ("has_summary", "has_score", "has_parquet"):
            row[flag] = bool(info[flag])
        row.update(
            {
                "metadata": metadata,
                "description": description,
                "requested_by": requested_by,
                "requested_by_label": requested_by_label,
                "environment": environment,
                "project_id": str(request_meta.get("project_id") or "").strip(),
                "task_type": task_type,
                "task_status": task_status,
                "evaluator_job_id": evaluator_job_id,
                "evaluator_report_url": evaluator_report_url,
                "evaluator_title": evaluator_title,
                "evaluator_target": evaluator_target,
                "branch_label": evaluator_target,
                "evaluator_git_sha": str(evaluator_meta.get("git_sha") or "").strip(),
                "evaluator_git_ref_url": str(evaluator_meta.get("git_ref_url") or "").strip(),
                "evaluator_git_commit_url": str(evaluator_meta.get("git_commit_url") or "").strip(),
                "evaluator_source_url": str(evaluator_meta.get("source_url") or "").strip(),
                "evaluator_source_repo_label": str(evaluator_meta.get("source_repo_label") or "").strip(),
                "catalog_id": catalog_id,
                "catalog_name": catalog_name,
                "catalog_label": catalog_label,
                "catalog_url": catalog_url,
                "passed_count": passed_count,
                "failed_count": failed_count,
                "canceled_count": canceled_count,
                "search_blob": search_blob,
            }
        )
        rows.append(row)

    rows.sort(key=lambda row: (-float(row["mtime"]), str(row["name"]).lower()))
    return rows
search_blob, + } + ) + runs.sort(key=lambda row: (-float(row["mtime"]), str(row["name"]).lower())) + return runs + + +@st.cache_data(ttl=24 * 3600, show_spinner=False) +def _resolve_subject_name(subject_id: str, environment: str) -> Dict[str, str]: + subject = str(subject_id or "").strip() + if not subject or not subject.startswith("t4:"): + return {"subject_id": subject, "name": subject, "email": ""} + org_id = os.environ.get( + "WEBAUTO_ORGANIZATION_ID", + "5a21621d-6968-4f7d-94f8-99cfb77b6e71", + ).strip() + if not org_id: + return {"subject_id": subject, "name": subject, "email": ""} + os.environ["AUTH_PROFILE"] = environment or "default" + from webautoauth.token import HttpService, TokenSource, load_config + + config = load_config() + token_source = TokenSource(HttpService(config)) + access_token = token_source.get_token().access_token + quoted_subject = urllib.parse.quote(subject, safe="") + url = f"https://auth.web.auto/v2/organizations/{org_id}/members/{quoted_subject}" + response = requests.get( + url, + headers={"Authorization": f"Bearer {access_token}", "accept": "application/json"}, + timeout=10, + ) + response.raise_for_status() + data = response.json() + return { + "subject_id": str(data.get("subject_id") or subject).strip(), + "name": str(data.get("name") or subject).strip(), + "email": str(data.get("email") or "").strip(), + } + + +def _inject_workflow_page_styles() -> None: + st.markdown( + """ + + """, + unsafe_allow_html=True, + ) + + +def _build_local_run_artifact_list(run_name: str) -> tuple[Optional[Path], list[tuple[Path, str]], str]: + run_path, err = resolve_run_subdirectory(run_name) + if err: + return None, [], err + assert run_path is not None + to_zip: list[tuple[Path, str]] = [] + summary_file = run_path / "Summary.csv" + score_file = run_path / "Score.csv" + if summary_file.is_file(): + to_zip.append((summary_file, "Summary.csv")) + if score_file.is_file(): + to_zip.append((score_file, "Score.csv")) + for pq in 
sorted(run_path.glob("*.parquet"), key=lambda p: p.name.lower()): + to_zip.append((pq, pq.name)) + return run_path, to_zip, "" + + +def _render_local_run_download_dialog(run_name: str) -> None: + run_path, to_zip, err = _build_local_run_artifact_list(run_name) + if err: + st.error(err) + return + if run_path is None: + st.error("Run path could not be resolved.") + return + + prepared_key = f"workflow_zip_prepared::{run_name}" + st.caption("Download the generated local artifacts for this run as one ZIP.") + if not to_zip: + st.info("This run has no Summary.csv, Score.csv, or top-level `.parquet` files.") + return + + st.caption(f"**{len(to_zip)}** file(s): {', '.join(arc for _, arc in to_zip)}") + prepared = st.session_state.get(prepared_key) + + if st.button("Prepare ZIP", key=f"workflow_prepare_zip::{run_name}", use_container_width=True): + buf = io.BytesIO() + zip_errors: list[str] = [] + included: list[str] = [] + with st.spinner("Building ZIP…"): + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + for fpath, arcname in to_zip: + try: + zf.write(fpath, arcname=arcname) + included.append(arcname) + except OSError as exc: + zip_errors.append(f"{arcname}: {exc}") + for msg in zip_errors: + st.warning(msg) + if included: + safe_stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", run_name).strip() or "run" + st.session_state[prepared_key] = { + "data": buf.getvalue(), + "file_name": f"{safe_stem}_artifacts.zip", + } + prepared = st.session_state.get(prepared_key) + else: + st.session_state.pop(prepared_key, None) + prepared = None + st.error("Could not add any files to the ZIP.") + + if prepared and prepared.get("data"): + st.download_button( + label=f"Download {prepared['file_name']}", + data=prepared["data"], + file_name=prepared["file_name"], + mime="application/zip", + key=f"workflow_dl_zip::{run_name}", + use_container_width=True, + ) + + +def _render_local_run_delete_dialog(run_name: str) -> None: + st.warning("This deletes the local run directory 
permanently.") + confirm = st.text_input( + "Type the run name to confirm", + value="", + placeholder=run_name, + key=f"workflow_delete_confirm::{run_name}", + ).strip() + if st.button("Delete run", key=f"workflow_delete_btn::{run_name}", type="primary", use_container_width=True): + if confirm != run_name: + st.error("Confirmation text does not match the run name.") + return + ok, msg = delete_run(run_name) + if ok: + st.session_state.pop("workflow_local_run_detail", None) + st.session_state.pop("workflow_local_run_download", None) + st.session_state.pop("workflow_local_run_delete", None) + st.session_state.pop(f"workflow_zip_prepared::{run_name}", None) + st.success(msg) + _load_local_runs.clear() + st.rerun() + st.error(msg) + + +def _render_local_runs_header() -> None: + header_cols = st.columns([0.45, 2.35, 0.72, 1.45, 1.55, 1.0, 1.0, 1.22, 0.68, 1.55], gap="small") + header_cols[0].markdown('
Pick
', unsafe_allow_html=True) + header_cols[1].markdown('
Name
', unsafe_allow_html=True) + header_cols[2].markdown('
User
', unsafe_allow_html=True) + header_cols[3].markdown('
Catalog
', unsafe_allow_html=True) + header_cols[4].markdown('
Evaluator
', unsafe_allow_html=True) + header_cols[5].markdown('
Result
', unsafe_allow_html=True) + header_cols[6].markdown('
Updated
', unsafe_allow_html=True) + header_cols[7].markdown('
Files
', unsafe_allow_html=True) + header_cols[8].markdown('
Size
', unsafe_allow_html=True) + header_cols[9].markdown('
Actions
', unsafe_allow_html=True) + + +def _run_needs_source_backfill(run: Dict[str, object]) -> bool: + return bool( + str(run.get("evaluator_job_id") or "").strip() + and str(run.get("project_id") or "").strip() + and ( + not str(run.get("evaluator_git_ref_url") or "").strip() + or not str(run.get("evaluator_git_commit_url") or "").strip() + or not str(run.get("evaluator_source_url") or "").strip() + or not str(run.get("evaluator_git_sha") or "").strip() + ) + ) + + +def _backfill_local_run_source_metadata(runs: List[Dict[str, object]]) -> Dict[str, int]: + updated = 0 + skipped = 0 + failed = 0 + for run in runs: + if not _run_needs_source_backfill(run): + skipped += 1 + continue + run_path = run.get("run_path") + if not isinstance(run_path, Path): + failed += 1 + continue + project_id = str(run.get("project_id") or "").strip() + environment = str(run.get("environment") or "default").strip() or "default" + evaluator_job_id = str(run.get("evaluator_job_id") or "").strip() + try: + detail = _fetch_evaluator_job_detail(project_id, environment, evaluator_job_id) + except Exception: + failed += 1 + continue + + patch = { + "evaluator": { + "target": str(detail.get("source_label") or run.get("evaluator_target") or "").strip(), + "git_sha": str(detail.get("git_sha") or run.get("evaluator_git_sha") or "").strip(), + "git_ref_url": str(detail.get("git_ref_url") or run.get("evaluator_git_ref_url") or "").strip(), + "git_commit_url": str(detail.get("git_commit_url") or run.get("evaluator_git_commit_url") or "").strip(), + "source_url": str(detail.get("source_url") or run.get("evaluator_source_url") or "").strip(), + "source_repo_label": str(detail.get("source_repo_label") or run.get("evaluator_source_repo_label") or "").strip(), + "catalog_name": str(detail.get("catalog") or run.get("catalog_name") or "").strip(), + "catalog_url": str(detail.get("catalog_url") or run.get("catalog_url") or "").strip(), + } + } + try: + upsert_run_metadata(run_path, patch, create_missing=False) + 
updated += 1 + except Exception: + failed += 1 + return {"updated": updated, "skipped": skipped, "failed": failed} + + +def _render_local_run_row(run: Dict[str, object], *, selected: bool) -> bool: + name_raw = str(run["name"]) + name = html.escape(name_raw) + modified = html.escape(str(run["modified"])) + user_label = html.escape(str(run.get("requested_by_label") or "—")) + catalog_label = html.escape(str(run.get("catalog_label") or run.get("catalog_id") or "—")) + catalog_url = html.escape(str(run.get("catalog_url") or "")) + evaluator_job_id = str(run.get("evaluator_job_id") or "").strip() + evaluator_report_url = str(run.get("evaluator_report_url") or "").strip() + evaluator_target = str(run.get("evaluator_target") or "").strip() + description = str(run.get("description") or "").strip() + evaluator_title = html.escape(str(run.get("evaluator_title") or description or evaluator_job_id or "—")) + source_label = str(run.get("evaluator_target") or evaluator_target or "—").strip() + source_url = str(run.get("evaluator_git_ref_url") or run.get("evaluator_source_url") or "").strip() + source_git_sha = str(run.get("evaluator_git_sha") or "").strip() + source_commit_url = str(run.get("evaluator_git_commit_url") or "").strip() + result_label = html.escape( + f"✅ {int(run.get('passed_count') or 0)} ❌ {int(run.get('failed_count') or 0)} ⏹ {int(run.get('canceled_count') or 0)}" + ) + task_type = str(run.get("task_type") or "").strip() + task_status = str(run.get("task_status") or "").strip() + meta_bits = [bit for bit in [task_type, task_status] if bit] + flags = [ + ("Summary", bool(run["has_summary"])), + ("Score", bool(run["has_score"])), + ("Parquet", bool(run["has_parquet"])), + ] + compare_available = any(enabled for _, enabled in flags) + title_class = "wf-run-title wf-run-text" + ("" if compare_available else " wf-run-title--muted") + cell_class = "wf-run-cell wf-run-text" + ("" if compare_available else " wf-run-cell--muted") + meta_class = "wf-meta-inline" + ("" if 
compare_available else " wf-meta-inline--muted") + flag_wrap_class = "wf-run-flags" + ("" if compare_available else " wf-run-flags--muted") + flag_html = "".join( + f'{label}' + for label, enabled in flags + ) + if not compare_available: + flag_html += '
Unavailable for compare
' + size_label = html.escape(str(run["size"])) + checkbox_key = f"workflow_compare_pick::{name_raw}" + if not compare_available: + st.session_state[checkbox_key] = False + elif checkbox_key not in st.session_state: + st.session_state[checkbox_key] = bool(selected) + row_cols = st.columns([0.45, 2.35, 0.72, 1.45, 1.55, 1.0, 1.0, 1.22, 0.68, 1.55], gap="small") + with row_cols[0]: + checked = st.checkbox( + "Select run", + key=checkbox_key, + label_visibility="collapsed", + disabled=not compare_available, + ) + with row_cols[1]: + title_html = f'' + if meta_bits: + meta_html = html.escape(" · ".join(meta_bits[:3])) + title_html += f'
{meta_html}
' + st.markdown(title_html, unsafe_allow_html=True) + with row_cols[2]: + st.markdown(f'
{user_label}
', unsafe_allow_html=True) + with row_cols[3]: + if catalog_url and catalog_label != "—": + st.markdown( + f'', + unsafe_allow_html=True, + ) + else: + st.markdown(f'
{catalog_label}
', unsafe_allow_html=True) + with row_cols[4]: + if evaluator_report_url and evaluator_job_id: + evaluator_html = f'' + else: + evaluator_html = f'
{evaluator_title}
' + source_ref_html = _format_source_ref_html(source_label, source_url, source_git_sha, source_commit_url) + if source_ref_html and source_ref_html != "—": + evaluator_html += f'
{source_ref_html}
' + st.markdown(evaluator_html, unsafe_allow_html=True) + with row_cols[5]: + st.markdown(f'
{result_label}
', unsafe_allow_html=True) + with row_cols[6]: + st.markdown(f'
{modified}
', unsafe_allow_html=True) + with row_cols[7]: + st.markdown(f'
{flag_html}
', unsafe_allow_html=True) + with row_cols[8]: + st.markdown(f'
{size_label}
', unsafe_allow_html=True) + with row_cols[9]: + action_cols = st.columns([0.78, 0.82, 0.82], gap="small") + with action_cols[0]: + if st.button("ℹ", key=f"workflow_run_details::{name_raw}", use_container_width=True, help="Show run details"): + st.session_state["workflow_local_run_detail"] = name_raw + with action_cols[1]: + if st.button("⬇", key=f"workflow_run_download::{name_raw}", use_container_width=True, help="Prepare ZIP download"): + st.session_state["workflow_local_run_download"] = name_raw + with action_cols[2]: + if st.button("🗑", key=f"workflow_run_delete::{name_raw}", use_container_width=True, help="Delete this local run"): + st.session_state["workflow_local_run_delete"] = name_raw + return bool(checked) + + +def _render_local_run_details(run: Dict[str, object]) -> None: + metadata = run.get("metadata") if isinstance(run.get("metadata"), dict) else {} + task_meta = metadata.get("task") if isinstance(metadata.get("task"), dict) else {} + request_meta = metadata.get("request") if isinstance(metadata.get("request"), dict) else {} + evaluator_meta = metadata.get("evaluator") if isinstance(metadata.get("evaluator"), dict) else {} + download_meta = metadata.get("download") if isinstance(metadata.get("download"), dict) else {} + scenario_download_meta = metadata.get("scenario_download") if isinstance(metadata.get("scenario_download"), dict) else {} + evaluation_meta = metadata.get("evaluation") if isinstance(metadata.get("evaluation"), dict) else {} + parquet_meta = metadata.get("parquet") if isinstance(metadata.get("parquet"), dict) else {} + project_id = str(request_meta.get("project_id") or "").strip() + request_environment = str(request_meta.get("environment") or "default").strip() or "default" + evaluator_job_id = str(evaluator_meta.get("job_id") or request_meta.get("job_id") or "").strip() + evaluator_report_url = str(evaluator_meta.get("report_url") or "").strip() + evaluator_target = str(evaluator_meta.get("target") or evaluator_meta.get("target_name") 
or request_meta.get("target_name") or "").strip() + evaluator_detail = {} + if project_id and evaluator_job_id: + try: + evaluator_detail = _fetch_evaluator_job_detail(project_id, request_environment, evaluator_job_id) + except Exception: + evaluator_detail = {} + source_url = str( + evaluator_meta.get("git_ref_url") + or evaluator_meta.get("source_url") + or evaluator_detail.get("source_url") + or evaluator_detail.get("git_ref_url") + or "" + ).strip() + source_commit_url = str( + evaluator_meta.get("git_commit_url") + or evaluator_detail.get("git_commit_url") + or "" + ).strip() + catalog_url = str(evaluator_detail.get("catalog_url") or "").strip() + source_label = str(evaluator_meta.get("target") or evaluator_detail.get("source_label") or evaluator_target or "").strip() + source_git_sha = str(evaluator_meta.get("git_sha") or evaluator_detail.get("git_sha") or "").strip() + source_ref_text = _format_source_ref_text(source_label or evaluator_target, source_git_sha) + source_ref_html = _format_source_ref_html(source_label or evaluator_target, source_url, source_git_sha, source_commit_url) + + with st.container(border=True): + title_cols = st.columns([3.4, 1.0]) + with title_cols[0]: + st.markdown(f"### Local Run Details: `{run['name']}`") + with title_cols[1]: + if st.button("Clear", key=f"workflow_clear_run_details::{run['name']}", use_container_width=True): + st.session_state["workflow_local_run_detail"] = "" + st.rerun() + + if not metadata: + st.info("This run was created before metadata capture was added. 
Showing only filesystem information.") + + top_cols = st.columns(4) + top_cols[0].metric("Updated", _metadata_text(run.get("modified"))) + top_cols[1].metric("Size", _metadata_text(run.get("size"))) + top_cols[2].metric("Task type", _metadata_text(task_meta.get("type") or metadata.get("source_mode"))) + top_cols[3].metric("Task status", _metadata_text(task_meta.get("status"))) + + run_cols = st.columns(2) + with run_cols[0]: + st.caption("Run folder") + st.code(str(run.get("path_display") or run.get("name") or ""), language=None) + with run_cols[1]: + st.caption("Available outputs") + st.write( + " | ".join( + label + for label, enabled in [ + ("Summary.csv", bool(run.get("has_summary"))), + ("Score.csv", bool(run.get("has_score"))), + ("Parquet", bool(run.get("has_parquet"))), + ] + if enabled + ) + or "—" + ) + + requested_by = str(task_meta.get("requested_by") or "").strip() + requested_by = str( + evaluator_meta.get("scheduled_by") + or requested_by + or "" + ).strip() + requested_by_label = requested_by or "—" + requested_by_label = _run_user_label(requested_by, request_environment) + + task_cols = st.columns(4) + task_cols[0].text_input("Requested by", value=requested_by_label, disabled=True, key=f"run_detail_user::{run['name']}") + task_cols[1].text_input("Task ID", value=_metadata_text(task_meta.get("id")), disabled=True, key=f"run_detail_tid::{run['name']}") + task_cols[2].text_input("Created", value=_format_metadata_time(task_meta.get("created_at") or metadata.get("created_at")), disabled=True, key=f"run_detail_created::{run['name']}") + task_cols[3].text_input("Updated", value=_format_metadata_time(task_meta.get("updated_at") or metadata.get("updated_at")), disabled=True, key=f"run_detail_updated::{run['name']}") + task_error = str(task_meta.get("error_message") or "").strip() + if task_error: + st.error(task_error) + + request_cols = st.columns(4) + request_cols[0].text_input("Project", value=_metadata_text(request_meta.get("project_id")), 
disabled=True, key=f"run_detail_project::{run['name']}") + request_cols[1].text_input("Environment", value=_metadata_text(request_environment), disabled=True, key=f"run_detail_env::{run['name']}") + request_cols[2].text_input("Catalog ID", value=_metadata_text(evaluator_meta.get("catalog_id") or request_meta.get("catalog_id")), disabled=True, key=f"run_detail_catalog::{run['name']}") + request_cols[3].text_input("Integration ID", value=_metadata_text(evaluator_meta.get("integration_id") or request_meta.get("integration_id")), disabled=True, key=f"run_detail_integration::{run['name']}") + + detail_cols = st.columns(3) + detail_cols[0].text_input("Evaluator job ID", value=_metadata_text(evaluator_meta.get("job_id") or request_meta.get("job_id")), disabled=True, key=f"run_detail_job::{run['name']}") + detail_cols[1].text_input("Source job ID", value=_metadata_text(evaluator_meta.get("source_job_id") or request_meta.get("source_job_id")), disabled=True, key=f"run_detail_source_job::{run['name']}") + detail_cols[2].text_input("Target", value=_metadata_text(evaluator_meta.get("target") or request_meta.get("target_name")), disabled=True, key=f"run_detail_target::{run['name']}") + + st.text_input("Description", value=_metadata_text(request_meta.get("description") or evaluator_meta.get("description")), disabled=True, key=f"run_detail_desc::{run['name']}") + + if evaluator_job_id: + action_cols = st.columns([1.15, 1.15, 1.15, 2.55]) + with action_cols[0]: + if evaluator_report_url: + st.link_button("Open report", evaluator_report_url, use_container_width=True) + with action_cols[1]: + if source_url: + st.link_button("Open source", source_url, use_container_width=True) + with action_cols[2]: + if catalog_url: + st.link_button("Open catalog", catalog_url, use_container_width=True) + with action_cols[3]: + if st.button("Artifact retest", key=f"workflow_local_run_retest::{run['name']}", type="primary", use_container_width=True): + 
st.session_state.pop(f"recent_eval_retest_suite_selection_{evaluator_job_id}", None) + st.session_state["workflow_local_run_retest"] = str(run["name"]) + st.rerun() + + info_cols = st.columns([1.6, 2.4]) + info_cols[0].text_input( + "Evaluator job", + value=evaluator_job_id, + disabled=True, + key=f"run_detail_job_full::{run['name']}", + ) + info_cols[1].text_input( + "Source ref", + value=_metadata_text(source_ref_text), + disabled=True, + key=f"run_detail_source_ref::{run['name']}", + ) + if source_ref_html and source_ref_html != "—": + st.markdown( + f'
GitHub: {source_ref_html}
', + unsafe_allow_html=True, + ) + + if evaluator_meta: + eval_cols = st.columns(4) + eval_cols[0].text_input("Evaluator status", value=_metadata_text(evaluator_meta.get("status")), disabled=True, key=f"run_detail_estatus::{run['name']}") + eval_cols[1].text_input("Build status", value=_metadata_text(evaluator_meta.get("build_status")), disabled=True, key=f"run_detail_build::{run['name']}") + eval_cols[2].text_input("Test status", value=_metadata_text(evaluator_meta.get("test_status")), disabled=True, key=f"run_detail_test::{run['name']}") + eval_cols[3].text_input("Report URL", value=_metadata_text(evaluator_meta.get("report_url")), disabled=True, key=f"run_detail_report::{run['name']}") + fail_message = str(evaluator_meta.get("fail_message") or "").strip() + if fail_message: + st.warning(fail_message) + case_totals = evaluator_meta.get("case_totals") if isinstance(evaluator_meta.get("case_totals"), dict) else {} + if case_totals: + case_cols = st.columns(4) + case_cols[0].metric("Cases total", str(case_totals.get("total", 0))) + case_cols[1].metric("Cases success", str(case_totals.get("success", 0))) + case_cols[2].metric("Cases failed", str(case_totals.get("failed", 0))) + case_cols[3].metric("Cases canceled", str(case_totals.get("canceled", 0))) + + if download_meta or scenario_download_meta: + active_download_meta = download_meta or scenario_download_meta + download_cols = st.columns(4) + download_cols[0].text_input("Download mode", value=_metadata_text(active_download_meta.get("mode") or metadata.get("source_mode")), disabled=True, key=f"run_detail_dl_mode::{run['name']}") + download_cols[1].text_input("Download type", value=_metadata_text(download_meta.get("download_type") or request_meta.get("download_type")), disabled=True, key=f"run_detail_dl_type::{run['name']}") + download_cols[2].text_input("Phase", value=_metadata_text(download_meta.get("phase") or request_meta.get("phase")), disabled=True, key=f"run_detail_phase::{run['name']}") + 
download_cols[3].text_input("Skip large files", value="Yes" if bool(download_meta.get("skip_large_file") or request_meta.get("skip_large_file")) else "No", disabled=True, key=f"run_detail_skip::{run['name']}") + + count_cols = st.columns(3) + count_cols[0].metric("Download total", str(active_download_meta.get("total", 0))) + count_cols[1].metric("Download success", str(active_download_meta.get("success", 0))) + count_cols[2].metric("Download failed", str(active_download_meta.get("failed", 0))) + + if evaluation_meta: + eval_run_cols = st.columns(4) + eval_run_cols[0].text_input("Eval enabled", value="Yes" if bool(evaluation_meta.get("enabled") or request_meta.get("run_eval")) else "No", disabled=True, key=f"run_detail_eval_enabled::{run['name']}") + eval_run_cols[1].text_input("Recursive", value="Yes" if bool(evaluation_meta.get("recursive") or request_meta.get("eval_recursive")) else "No", disabled=True, key=f"run_detail_eval_recursive::{run['name']}") + eval_run_cols[2].text_input("Summary rows", value=str(evaluation_meta.get("summary_rows", "—")), disabled=True, key=f"run_detail_summary_rows::{run['name']}") + eval_run_cols[3].text_input("Score rows", value=str(evaluation_meta.get("score_rows", "—")), disabled=True, key=f"run_detail_score_rows::{run['name']}") + + if parquet_meta: + st.text_input("Parquet path", value=_metadata_text(parquet_meta.get("path")), disabled=True, key=f"run_detail_parquet::{run['name']}") + + suites = evaluator_meta.get("suites") if isinstance(evaluator_meta.get("suites"), list) else [] + failed_cases = evaluator_meta.get("failed_cases") if isinstance(evaluator_meta.get("failed_cases"), list) else [] + if suites: + with st.expander("Evaluator suites", expanded=False): + st.dataframe(suites, width="stretch", hide_index=True) + if failed_cases: + with st.expander("Failed cases", expanded=False): + st.dataframe(failed_cases, width="stretch", hide_index=True) + + with st.expander("Raw run metadata", expanded=False): + st.json(metadata or 
{}) + + selected_retest_run = str(st.session_state.get("workflow_local_run_retest") or "").strip() + if selected_retest_run == str(run["name"]) and evaluator_job_id: + dialog_job = { + "job_id": evaluator_job_id, + "title": str(evaluator_detail.get("title") or run.get("description") or run["name"]), + } + if callable(getattr(st, "dialog", None)): + try: + @st.dialog(f"Artifact retest · {dialog_job['title']}", width="large") + def _workflow_local_run_retest_dialog() -> None: + _render_recent_evaluator_job_retest_dialog( + project_id, + request_environment, + dialog_job, + output_path_default="", + phase_default=str(request_meta.get("phase") or "perception.object_recognition.tracking.objects"), + ) + + _workflow_local_run_retest_dialog() + finally: + if st.session_state.get("workflow_local_run_retest") == str(run["name"]): + st.session_state.pop("workflow_local_run_retest", None) + else: + st.markdown("---") + fallback_cols = st.columns([4.2, 1.0]) + with fallback_cols[0]: + st.subheader(f"Artifact retest · {dialog_job['title']}") + with fallback_cols[1]: + if st.button("Close", key=f"workflow_local_run_retest_close::{run['name']}", use_container_width=True): + st.session_state.pop("workflow_local_run_retest", None) + st.rerun() + _render_recent_evaluator_job_retest_dialog( + project_id, + request_environment, + dialog_job, + output_path_default="", + phase_default=str(request_meta.get("phase") or "perception.object_recognition.tracking.objects"), + ) + + +def _render_local_runs_section() -> None: + section_header("Local Runs", "") + runs = _load_local_runs() + if not runs: + st.markdown('
No finished runs were found on this server yet.
', unsafe_allow_html=True) + return + missing_source_runs = sum(1 for run in runs if _run_needs_source_backfill(run)) + local_runs_toolbar_cols = st.columns([4.2, 1.2]) + with local_runs_toolbar_cols[0]: + if missing_source_runs: + st.caption(f"{missing_source_runs} run(s) are missing stored GitHub metadata.") + with local_runs_toolbar_cols[1]: + if missing_source_runs and st.button( + "Backfill GitHub", + key="workflow_backfill_local_run_source_meta", + use_container_width=True, + ): + with st.spinner("Backfilling missing GitHub metadata for local runs..."): + result = _backfill_local_run_source_metadata(runs) + _load_local_runs.clear() + if result["failed"]: + st.warning( + f"Backfill updated {result['updated']} run(s), skipped {result['skipped']} run(s), failed on {result['failed']} run(s)." + ) + else: + st.success( + f"Backfill updated {result['updated']} run(s); {result['skipped']} run(s) already had metadata." + ) + st.rerun() + + current_user_id = str(get_task_list_current_user() or "").strip() + user_options = ["All users"] + if current_user_id: + user_options.append("My runs") + unique_users = [] + seen_users = set() + user_option_subject_map = {"All users": "", "My runs": current_user_id, "(Auto)": "__auto__"} + for row in runs: + subject_id = str(row.get("requested_by") or "").strip() + label = str(row.get("requested_by_label") or "").strip() + option = label or "(Auto)" + if not subject_id: + if "(Auto)" not in user_options: + user_options.append("(Auto)") + continue + deduped_option = option + suffix = 2 + while deduped_option in seen_users and user_option_subject_map.get(deduped_option) != subject_id: + deduped_option = f"{option} [{suffix}]" + suffix += 1 + if deduped_option not in seen_users: + unique_users.append(deduped_option) + seen_users.add(deduped_option) + user_option_subject_map[deduped_option] = subject_id + user_options.extend(unique_users) + + catalog_options = ["All catalogs"] + catalog_option_id_map = {"All catalogs": ""} + 
unique_catalogs = [] + seen_catalogs = set() + for row in runs: + catalog_id = str(row.get("catalog_id") or "").strip() + catalog_label = str(row.get("catalog_label") or row.get("catalog_name") or catalog_id or "—").strip() + if not catalog_id: + continue + option = catalog_label or catalog_id + deduped_option = option + suffix = 2 + while deduped_option in seen_catalogs and catalog_option_id_map.get(deduped_option) != catalog_id: + deduped_option = f"{option} [{suffix}]" + suffix += 1 + if deduped_option not in seen_catalogs: + unique_catalogs.append(deduped_option) + seen_catalogs.add(deduped_option) + catalog_option_id_map[deduped_option] = catalog_id + catalog_options.extend(sorted(unique_catalogs, key=str.lower)) + + current_user_option = str(st.session_state.get("workflow_runs_user_filter", "All users")) + if current_user_option not in user_options: + current_user_option = "All users" + st.session_state["workflow_runs_user_filter"] = current_user_option + current_catalog_option = str(st.session_state.get("workflow_runs_catalog_filter", "All catalogs")) + if current_catalog_option not in catalog_options: + current_catalog_option = "All catalogs" + st.session_state["workflow_runs_catalog_filter"] = current_catalog_option + branch_options = ["All branches"] + unique_branches = sorted( + { + str(row.get("branch_label") or row.get("evaluator_target") or "").strip() + for row in runs + if str(row.get("branch_label") or row.get("evaluator_target") or "").strip() + }, + key=str.lower, + ) + branch_options.extend(unique_branches) + current_branch_option = str(st.session_state.get("workflow_runs_branch_filter", "All branches")) + if current_branch_option not in branch_options: + current_branch_option = "All branches" + st.session_state["workflow_runs_branch_filter"] = current_branch_option + + st.markdown('
', unsafe_allow_html=True) + control_cols = st.columns([1.7, 1.15, 1.1, 0.95, 0.95]) + with control_cols[0]: + st.markdown('
Search
', unsafe_allow_html=True) + run_search_input = st.text_input( + "Search runs", + value=st.session_state.get("workflow_runs_search", ""), + key="workflow_runs_search", + label_visibility="collapsed", + placeholder="Filter by name, description, job id, catalog, user", + ) + with control_cols[1]: + st.markdown('
Catalog
', unsafe_allow_html=True) + catalog_filter_input = st.selectbox( + "Catalog", + options=catalog_options, + index=catalog_options.index(current_catalog_option), + key="workflow_runs_catalog_filter", + label_visibility="collapsed", + ) + with control_cols[2]: + st.markdown('
Branch
', unsafe_allow_html=True) + branch_filter_input = st.selectbox( + "Branch", + options=branch_options, + index=branch_options.index(current_branch_option), + key="workflow_runs_branch_filter", + label_visibility="collapsed", + ) + with control_cols[3]: + st.markdown('
User
', unsafe_allow_html=True) + user_filter_input = st.selectbox( + "User", + options=user_options, + index=user_options.index(current_user_option), + key="workflow_runs_user_filter", + label_visibility="collapsed", + ) + with control_cols[4]: + st.markdown('
Rows
', unsafe_allow_html=True) + page_size_input = int( + st.selectbox( + "Rows", + options=[10, 20, 50, 100], + index=[10, 20, 50, 100].index(int(st.session_state.get("workflow_runs_page_size", 10) or 10)), + key="workflow_runs_page_size", + label_visibility="collapsed", + ) + ) + + second_control_cols = st.columns([0.92, 0.92, 0.6, 0.6, 2.4]) + with second_control_cols[0]: + st.markdown('
From
', unsafe_allow_html=True) + date_from_input = st.date_input( + "From", + value=st.session_state.get("workflow_runs_date_from", None), + key="workflow_runs_date_from", + label_visibility="collapsed", + help="Run modified-date lower bound in JST.", + ) + with second_control_cols[1]: + st.markdown('
To
', unsafe_allow_html=True) + date_to_input = st.date_input( + "To", + value=st.session_state.get("workflow_runs_date_to", None), + key="workflow_runs_date_to", + label_visibility="collapsed", + help="Run modified-date upper bound in JST.", + ) + with second_control_cols[2]: + st.markdown('
Summary
', unsafe_allow_html=True) + require_summary_input = st.toggle( + "Summary only", + value=bool(st.session_state.get("workflow_runs_summary_filter", False)), + key="workflow_runs_summary_filter", + label_visibility="collapsed", + ) + with second_control_cols[3]: + st.markdown('
Parquet
', unsafe_allow_html=True) + require_parquet_input = st.toggle( + "Parquet only", + value=bool(st.session_state.get("workflow_runs_parquet_filter", False)), + key="workflow_runs_parquet_filter", + label_visibility="collapsed", + ) + with second_control_cols[4]: + st.markdown( + '
Pick a catalog, branch, or user directly, or narrow with text and dates.
', + unsafe_allow_html=True, + ) + st.markdown('
', unsafe_allow_html=True) + + current_filter_signature = ( + str(run_search_input or ""), + str(catalog_filter_input or "All catalogs"), + str(branch_filter_input or "All branches"), + str(user_filter_input or "All users"), + date_from_input, + date_to_input, + bool(require_summary_input), + bool(require_parquet_input), + int(page_size_input), + ) + previous_filter_signature = st.session_state.get("workflow_runs_filter_signature") + if previous_filter_signature is None: + st.session_state["workflow_runs_filter_signature"] = current_filter_signature + elif previous_filter_signature != current_filter_signature: + st.session_state["workflow_runs_filter_signature"] = current_filter_signature + st.session_state["workflow_runs_page"] = 1 + + run_search = str(run_search_input).strip().lower() + selected_catalog_filter = str(catalog_filter_input).strip() + selected_branch_filter = str(branch_filter_input).strip() + selected_user_filter = str(user_filter_input).strip() + selected_date_from = date_from_input + selected_date_to = date_to_input + require_summary = bool(require_summary_input) + require_parquet = bool(require_parquet_input) + page_size = int(page_size_input) + + if selected_date_from and selected_date_to and selected_date_from > selected_date_to: + st.warning("`From` date must be earlier than or equal to `To` date.") + return + + filtered = runs + if run_search: + filtered = [row for row in filtered if run_search in str(row.get("search_blob") or row["name"]).lower()] + if selected_catalog_filter not in ("", "All catalogs"): + selected_catalog_id = str(catalog_option_id_map.get(selected_catalog_filter) or "").strip() + filtered = [row for row in filtered if str(row.get("catalog_id") or "").strip() == selected_catalog_id] + if selected_branch_filter not in ("", "All branches"): + filtered = [ + row for row in filtered + if str(row.get("branch_label") or row.get("evaluator_target") or "").strip() == selected_branch_filter + ] + if selected_user_filter == "My runs" 
and current_user_id: + filtered = [row for row in filtered if str(row.get("requested_by") or "").strip() == current_user_id] + elif selected_user_filter == "(Auto)": + filtered = [row for row in filtered if not str(row.get("requested_by") or "").strip()] + elif selected_user_filter not in ("", "All users", "My runs"): + selected_subject_id = str(user_option_subject_map.get(selected_user_filter) or "").strip() + filtered = [row for row in filtered if str(row.get("requested_by") or "").strip() == selected_subject_id] + if selected_date_from: + filtered = [row for row in filtered if row.get("mtime_date") and row["mtime_date"] >= selected_date_from] + if selected_date_to: + filtered = [row for row in filtered if row.get("mtime_date") and row["mtime_date"] <= selected_date_to] + if require_summary: + filtered = [row for row in filtered if bool(row["has_summary"])] + if require_parquet: + filtered = [row for row in filtered if bool(row["has_parquet"])] + + compare_ready = [ + str(row["name"]) + for row in filtered + if bool(row["has_summary"]) or bool(row["has_score"]) or bool(row["has_parquet"]) + ] + if "workflow_compare_runs" not in st.session_state: + st.session_state["workflow_compare_runs"] = compare_ready[:1] + + compare_selected = [ + name for name in st.session_state.get("workflow_compare_runs", []) + if name in compare_ready + ] + st.session_state["workflow_compare_runs"] = compare_selected + + if not filtered: + st.markdown('
No local runs matched the current filters.
', unsafe_allow_html=True) + return + + page_key = "workflow_runs_page" + current_page = max(1, int(st.session_state.get(page_key, 1))) + page_count = max(1, (len(filtered) + page_size - 1) // page_size) + if current_page > page_count: + current_page = page_count + st.session_state[page_key] = current_page + start_idx = (current_page - 1) * page_size + visible_runs = filtered[start_idx:start_idx + page_size] + visible_names = {str(run["name"]) for run in visible_runs} + + visible_end = min(len(filtered), start_idx + len(visible_runs)) + st.markdown('
', unsafe_allow_html=True) + pager_cols = st.columns([0.65, 1.0, 0.65, 3.2]) + with pager_cols[0]: + if st.button("‹", key="workflow_runs_page_prev", use_container_width=True, disabled=current_page <= 1): + current_page -= 1 + st.session_state[page_key] = current_page + st.rerun() + with pager_cols[1]: + selected_page = st.selectbox( + "Page", + options=list(range(1, page_count + 1)), + index=max(0, current_page - 1), + label_visibility="collapsed", + ) + if selected_page != current_page: + st.session_state[page_key] = int(selected_page) + current_page = int(selected_page) + start_idx = (current_page - 1) * page_size + visible_runs = filtered[start_idx:start_idx + page_size] + visible_names = {str(run["name"]) for run in visible_runs} + with pager_cols[2]: + if st.button("›", key="workflow_runs_page_next", use_container_width=True, disabled=current_page >= page_count): + current_page += 1 + st.session_state[page_key] = current_page + st.rerun() + with pager_cols[3]: + st.markdown( + f'
{start_idx + 1}{visible_end} of {len(filtered)} runs · {page_size} per page
', + unsafe_allow_html=True, + ) + st.markdown('
', unsafe_allow_html=True) + + _render_local_runs_header() + next_selected = [name for name in st.session_state.get("workflow_compare_runs", []) if name not in visible_names] + for run in visible_runs: + run_name = str(run["name"]) + if _render_local_run_row(run, selected=run_name in st.session_state.get("workflow_compare_runs", [])) and run_name in compare_ready: + next_selected.append(run_name) + st.session_state["workflow_compare_runs"] = [name for name in compare_ready if name in next_selected] + + st.markdown('
', unsafe_allow_html=True) + st.markdown('

Compare

', unsafe_allow_html=True) + compare_cols = st.columns([3.4, 1.0]) + with compare_cols[0]: + st.markdown('
Selected runs
', unsafe_allow_html=True) + selected_runs = list(st.session_state.get("workflow_compare_runs", [])) + if selected_runs: + st.caption(" | ".join(selected_runs)) + with compare_cols[1]: + st.markdown('
Action
', unsafe_allow_html=True) + if len(selected_runs) >= 2: + st.link_button("Compare", _build_overview_url(selected_runs[0], selected_runs[1:]), use_container_width=True) + elif len(selected_runs) == 1: + st.link_button("Open", _build_overview_url(selected_runs[0]), use_container_width=True) + else: + st.button("Open", disabled=True, use_container_width=True, key="workflow_compare_run_disabled") + st.markdown("
", unsafe_allow_html=True) + + download_run_name = str(st.session_state.get("workflow_local_run_download") or "").strip() + if download_run_name: + if callable(getattr(st, "dialog", None)): + @st.dialog(f"Download artifacts · {download_run_name}", width="large") + def _workflow_local_run_download_dialog() -> None: + _render_local_run_download_dialog(download_run_name) + if st.button("Close", key=f"workflow_local_run_download_close::{download_run_name}", use_container_width=True): + st.session_state.pop("workflow_local_run_download", None) + st.rerun() + + _workflow_local_run_download_dialog() + else: + st.markdown("---") + st.subheader(f"Download artifacts · {download_run_name}") + _render_local_run_download_dialog(download_run_name) + + delete_run_name = str(st.session_state.get("workflow_local_run_delete") or "").strip() + if delete_run_name: + if callable(getattr(st, "dialog", None)): + @st.dialog(f"Delete local run · {delete_run_name}", width="large") + def _workflow_local_run_delete_dialog() -> None: + _render_local_run_delete_dialog(delete_run_name) + if st.button("Cancel", key=f"workflow_local_run_delete_close::{delete_run_name}", use_container_width=True): + st.session_state.pop("workflow_local_run_delete", None) + st.rerun() + + _workflow_local_run_delete_dialog() + else: + st.markdown("---") + st.subheader(f"Delete local run · {delete_run_name}") + _render_local_run_delete_dialog(delete_run_name) + + detail_run_name = str(st.session_state.get("workflow_local_run_detail") or "").strip() + if detail_run_name: + detail_run = next((row for row in runs if str(row["name"]) == detail_run_name), None) + if detail_run is not None: + _render_local_run_details(detail_run) + + +def _render_current_tasks_section() -> None: + section_header("Current Tasks", "") + if not is_task_queue_enabled(): + st.info("Task queue not enabled. 
Set `USE_TASK_QUEUE=true` to track background tasks.") + return + + current_user = get_task_list_current_user() + if "workflow_task_history_range" not in st.session_state: + st.session_state["workflow_task_history_range"] = "7 days" + if "workflow_task_history_page_size" not in st.session_state: + st.session_state["workflow_task_history_page_size"] = 20 + if "workflow_task_history_page" not in st.session_state: + st.session_state["workflow_task_history_page"] = 1 + + control_cols = st.columns([1.3, 1.0, 1.0, 2.7]) + with control_cols[0]: + selected_range = st.selectbox( + "History range", + options=list(_TASK_HISTORY_RANGE_OPTIONS.keys()), + key="workflow_task_history_range", + ) + with control_cols[1]: + page_size = int( + st.selectbox( + "Rows", + options=[20, 50, 100], + key="workflow_task_history_page_size", + ) + ) + since_days = _TASK_HISTORY_RANGE_OPTIONS.get(selected_range, _TASK_LIST_SINCE_DAYS) + total_tasks = count_recent_tasks(session_id=current_user, since_days=since_days) + page_count = max(1, (total_tasks + page_size - 1) // page_size) if total_tasks else 1 + current_page = min(max(1, int(st.session_state.get("workflow_task_history_page", 1))), page_count) + st.session_state["workflow_task_history_page"] = current_page + with control_cols[2]: + selected_page = st.selectbox( + "Page", + options=list(range(1, page_count + 1)), + index=current_page - 1, + key="workflow_task_history_page_select", + ) + if int(selected_page) != current_page: + current_page = int(selected_page) + st.session_state["workflow_task_history_page"] = current_page + with control_cols[3]: + label = selected_range if since_days is not None else "all time" + st.caption(f"Showing **{total_tasks}** tasks across **{page_count}** page(s) for **{label}**.") + + offset = (current_page - 1) * page_size + use_fragment = getattr(st, "fragment", None) is not None + if use_fragment: + try: + + @st.fragment(run_every=timedelta(seconds=3)) + def _task_list_poll(): + current_tasks = 
list_recent_tasks( + limit=page_size, + offset=offset, + session_id=current_user, + since_days=since_days, + ) + render_task_list(current_tasks, current_user) + + _task_list_poll() + return + except (TypeError, AttributeError): + use_fragment = False + + tasks = list_recent_tasks( + limit=page_size, + offset=offset, + session_id=current_user, + since_days=since_days, + ) + has_active = render_task_list(tasks, current_user) + if st.button("Refresh tasks", key="workflow_refresh_tasks"): + st.rerun() + if has_active: + st.caption("Active jobs are shown live when possible. Use refresh if this browser does not support fragments.") + + +def _get_start_workflow_defaults() -> Dict[str, object]: + default_target = get_config_value("target_name", "beta/v4.3.2") + return { + "project_id": get_config_value("eval_project_id", "x2_dev"), + "environment": get_config_value("environment", ""), + "output_path_default": _make_default_output_path(default_target), + "download_type_default": get_config_value("eval_download_type", "Archives (ZIP)"), + "phase_default": get_config_value( + "eval_phase", + "perception.object_recognition.tracking.objects", + ), + "skip_large_file_default": True, + "large_file_mb_default": 50.0, + "keep_zip_files_default": False, + } + + +def _render_start_workflow_form( + catalog_presets: List[Dict[str, str]], + catalogs_path: Optional[str], + catalog_load_error: Optional[str], +) -> Dict[str, object]: + if catalog_load_error: + st.warning(f"Could not read catalog presets: {catalog_load_error}") + elif catalogs_path: + st.caption(f"Catalog presets loaded from `{catalogs_path}`.") + + catalog_names = [item["display_name"] for item in catalog_presets] + default_project = get_config_value("eval_project_id", "x2_dev") + default_target = get_config_value("target_name", "beta/v4.3.2") + default_download_type = get_config_value("eval_download_type", "Archives (ZIP)") + default_phase = get_config_value( + "eval_phase", + 
"perception.object_recognition.tracking.objects", + ) + default_poll_interval = int(get_config_value("poll_interval", 60)) + default_max_wait_hours = int(get_config_value("max_wait_hours", 24)) + default_environment = get_config_value("environment", "") + default_output = _make_default_output_path(default_target) + default_skip_large_file = True + + if "workflow_server_catalogs" not in st.session_state: + st.session_state["workflow_server_catalogs"] = [] + if "workflow_server_catalog_error" not in st.session_state: + st.session_state["workflow_server_catalog_error"] = "" + if "workflow_selected_server_catalog_id" not in st.session_state: + st.session_state["workflow_selected_server_catalog_id"] = "" + if "workflow_catalog_id" not in st.session_state: + st.session_state["workflow_catalog_id"] = "" + if "workflow_integration_id" not in st.session_state: + st.session_state["workflow_integration_id"] = "" + if "workflow_catalog_resolution_error" not in st.session_state: + st.session_state["workflow_catalog_resolution_error"] = "" + if "workflow_last_catalog_selection" not in st.session_state: + st.session_state["workflow_last_catalog_selection"] = "" + + server_catalogs = st.session_state.get("workflow_server_catalogs", []) or [] + server_catalog_labels = [ + f"{item['display_name']} ({item['catalog_id']})" for item in server_catalogs + ] + catalog_options = [""] + catalog_names + [ + label for label in server_catalog_labels if label not in catalog_names + ] + preset_by_label = {item["display_name"]: item for item in catalog_presets} + server_by_label = { + f"{item['display_name']} ({item['catalog_id']})": item for item in server_catalogs + } + + release_mode = st.checkbox( + "Release data workflow: schedule Performance Test + Devops Test", + value=bool(st.session_state.get("workflow_release_mode", False)), + key="workflow_release_mode", + help="Queues the two standard release evaluator jobs, processes both as normal app runs, then generates a release specsheet with 
trend data.", + ) + if release_mode: + st.info( + "Release mode uses the app-native flow: schedule Performance Test and Devops Test, create normal CSV/parquet analysis folders, write release metadata, and generate the trend-enabled specsheet PDF." + ) + + top_cols = st.columns([1.0, 1.9, 1.2]) + with top_cols[0]: + st.markdown('
Project
', unsafe_allow_html=True) + project_id = st.text_input( + "Project ID", + value=default_project, + key="workflow_project_id", + label_visibility="collapsed", + ).strip() + with top_cols[1]: + st.markdown('
Catalog
', unsafe_allow_html=True) + catalog_picker_cols = st.columns([4.2, 1.1], gap="small") + with catalog_picker_cols[0]: + selected_catalog_name = st.selectbox( + "Catalog", + options=catalog_options if catalog_options else [""], + index=catalog_options.index(st.session_state.get("workflow_catalog_name", "")) if st.session_state.get("workflow_catalog_name", "") in catalog_options else 0, + key="workflow_catalog_name", + label_visibility="collapsed", + format_func=lambda value: value or "Choose a catalog", + disabled=release_mode, + ) + with catalog_picker_cols[1]: + fetch_catalogs_clicked = st.button( + "Fetch", + key="workflow_fetch_server_catalogs", + use_container_width=True, + disabled=release_mode, + ) + if fetch_catalogs_clicked: + try: + current_environment = str(st.session_state.get("workflow_environment", default_environment) or "") + st.session_state["workflow_server_catalogs"] = _fetch_server_catalogs(project_id, current_environment) + st.session_state["workflow_server_catalog_error"] = "" + except Exception as exc: + st.session_state["workflow_server_catalogs"] = [] + st.session_state["workflow_server_catalog_error"] = str(exc) + selected_catalog = preset_by_label.get(selected_catalog_name) + selected_server_catalog = server_by_label.get(selected_catalog_name) + if "workflow_last_catalog_preset" not in st.session_state: + st.session_state["workflow_last_catalog_preset"] = "" + if st.session_state["workflow_last_catalog_preset"] != selected_catalog_name and selected_catalog: + st.session_state["workflow_catalog_id"] = str(selected_catalog.get("catalog_id") or "") + st.session_state["workflow_integration_id"] = str(selected_catalog.get("integration_id") or "") + st.session_state["workflow_selected_server_catalog_id"] = "" + st.session_state["workflow_catalog_resolution_error"] = "" + st.session_state["workflow_last_catalog_preset"] = selected_catalog_name + elif selected_server_catalog: + st.session_state["workflow_catalog_id"] = 
str(selected_server_catalog.get("catalog_id") or "") + st.session_state["workflow_selected_server_catalog_id"] = str(selected_server_catalog.get("catalog_id") or "") + current_environment = str(st.session_state.get("workflow_environment", default_environment) or "") + if st.session_state["workflow_last_catalog_selection"] != selected_catalog_name: + try: + st.session_state["workflow_integration_id"] = _resolve_integration_id_for_catalog( + project_id, + current_environment, + st.session_state["workflow_catalog_id"], + ) + st.session_state["workflow_catalog_resolution_error"] = "" + except Exception as exc: + st.session_state["workflow_integration_id"] = "" + st.session_state["workflow_catalog_resolution_error"] = str(exc) + st.session_state["workflow_last_catalog_selection"] = selected_catalog_name + elif st.session_state["workflow_last_catalog_selection"] != selected_catalog_name: + st.session_state["workflow_catalog_resolution_error"] = "" + st.session_state["workflow_last_catalog_selection"] = selected_catalog_name + with top_cols[2]: + st.markdown('
Branch or tag
', unsafe_allow_html=True) + target_name = st.text_input( + "Branch or Tag", + value=default_target, + key="workflow_target_name", + label_visibility="collapsed", + placeholder="beta/v4.3.2", + ).strip() + + catalog_id = str(st.session_state.get("workflow_catalog_id") or "").strip() + integration_id = str(st.session_state.get("workflow_integration_id") or "").strip() + + if st.session_state.get("workflow_server_catalog_error"): + st.warning(f"Could not fetch catalogs: {st.session_state['workflow_server_catalog_error']}") + catalog_id = str(st.session_state.get("workflow_catalog_id") or "").strip() + + picker_cols = st.columns([1.2, 1.2, 1.75]) + with picker_cols[0]: + st.markdown( + f'
{"Release output folder" if release_mode else "Output folder"}
', + unsafe_allow_html=True, + ) + output_path = st.text_input( + "Release output folder" if release_mode else "Output folder", + value=default_output, + key="workflow_output_path", + label_visibility="collapsed", + placeholder=_make_default_output_path(target_name), + help=( + "Folder under data/. Release mode creates metadata.yaml, performance/, devops/, and specsheet/ in this single folder." + if release_mode + else "Output folder under the data directory." + ), + ).strip() + with picker_cols[1]: + st.markdown('
Phase
', unsafe_allow_html=True) + phase_value = "perception.object_recognition.tracking.objects" if release_mode else default_phase + phase = st.text_input( + "Phase", + value=phase_value, + key="workflow_phase", + label_visibility="collapsed", + disabled=release_mode, + help=( + "Release mode uses this standard phase automatically for both detailed-analysis downloads." + if release_mode + else None + ), + ) + with picker_cols[2]: + st.markdown('
Description
', unsafe_allow_html=True) + description = st.text_input( + "Description", + value=get_config_value("workflow_description", ""), + key="workflow_description", + label_visibility="collapsed", + placeholder="Optional label for the evaluator run", + ).strip() + + trend_metadata: Dict[str, object] = {} + if release_mode: + metadata_default_key = "workflow_release_metadata_default_target" + metadata_text_key = "workflow_release_metadata_text" + if ( + st.session_state.get(metadata_default_key) != target_name + or metadata_text_key not in st.session_state + ): + st.session_state[metadata_text_key] = _make_default_release_metadata_text(target_name) + st.session_state[metadata_default_key] = target_name + + current_metadata_text = str(st.session_state.get(metadata_text_key) or "") + trend_topic_from_metadata = _extract_release_metadata_topic(current_metadata_text) + option_values = list(_RELEASE_TREND_TOPIC_OPTIONS.values()) + topic_labels = list(_RELEASE_TREND_TOPIC_OPTIONS.keys()) + if trend_topic_from_metadata in option_values: + topic_index = option_values.index(trend_topic_from_metadata) + else: + topic_index = topic_labels.index("Custom") + st.session_state.setdefault("workflow_release_custom_trend_topic", trend_topic_from_metadata) + + topic_label_key = "workflow_release_trend_topic_label" + topic_yaml_key = "workflow_release_trend_topic_yaml_value" + if st.session_state.get(topic_yaml_key) != trend_topic_from_metadata: + st.session_state[topic_label_key] = topic_labels[topic_index] + st.session_state[topic_yaml_key] = trend_topic_from_metadata + if topic_labels[topic_index] == "Custom": + st.session_state["workflow_release_custom_trend_topic"] = trend_topic_from_metadata + + trend_topic_label = st.selectbox( + "Trend topic", + options=topic_labels, + key=topic_label_key, + help="Used only for trend graphs. 
The specsheet data topic is detected from parquet/csv separately.", + ) + if trend_topic_label == "Custom": + trend_topic = st.text_input( + "Custom trend topic", + value=st.session_state.get("workflow_release_custom_trend_topic", trend_topic_from_metadata), + key="workflow_release_custom_trend_topic", + placeholder="perception.object_recognition.objects", + ).strip() + else: + trend_topic = _RELEASE_TREND_TOPIC_OPTIONS[trend_topic_label] + if trend_topic and trend_topic != trend_topic_from_metadata: + st.session_state[metadata_text_key] = _replace_release_metadata_topic( + current_metadata_text, + trend_topic, + ) + st.session_state[topic_yaml_key] = trend_topic + + metadata_text = st.text_area( + "Release metadata YAML", + key=metadata_text_key, + height=150, + help=( + "Required: tags: [trend], release_group, pilot_auto_version, data_count, description, date. " + "date must look like 2026.5.22." + ), + ) + metadata_error = "" + try: + trend_metadata = parse_trend_metadata_text(metadata_text) + if not str(trend_metadata.get("release_group") or "").strip(): + raise ValueError("Release metadata requires non-empty `release_group`.") + except Exception as exc: + metadata_error = str(exc) + trend_metadata = {} + st.error(f"Release metadata error: {metadata_error}") + + trend_topic_from_metadata = str(trend_metadata.get("topic_name") or "").strip() + if release_mode and trend_metadata and not trend_topic_from_metadata: + metadata_error = metadata_error or "Trend topic is required." 
+ st.error("Trend topic is required.") + elif trend_metadata: + st.success("Release metadata looks valid.") + + optional_catalog_enabled = st.checkbox( + "Also run Planning Test catalog", + value=bool(st.session_state.get("workflow_release_optional_catalog_enabled", False)), + key="workflow_release_optional_catalog_enabled", + help="Schedules the Planning Test catalog in addition to Performance and DevOps.", + ) + existing_job_cols = st.columns(2) + with existing_job_cols[0]: + performance_job_id = st.text_input( + "Existing Performance job ID", + value=st.session_state.get("workflow_release_performance_job_id", ""), + key="workflow_release_performance_job_id", + placeholder="Leave empty to schedule a new Performance job", + help="Use this when the release Performance evaluator job is already scheduled or finished.", + ).strip() + with existing_job_cols[1]: + devops_job_id = st.text_input( + "Existing DevOps job ID", + value=st.session_state.get("workflow_release_devops_job_id", ""), + key="workflow_release_devops_job_id", + placeholder="Leave empty to schedule a new DevOps job", + help="Use this when the release DevOps evaluator job is already scheduled or finished.", + ).strip() + if optional_catalog_enabled: + optional_job_id = st.text_input( + "Existing Planning Test job ID", + value=st.session_state.get("workflow_release_optional_job_id", ""), + key="workflow_release_optional_job_id", + placeholder="Leave empty to schedule the Planning Test catalog", + help="Use this when the Planning Test evaluator job is already scheduled or finished.", + ).strip() + else: + optional_job_id = "" + output_dirs = "`performance/`, `devops/`, and `planning_test/`" if optional_catalog_enabled else "`performance/` and `devops/`" + st.caption( + f"Normal detailed-analysis outputs are generated automatically under {output_dirs}; existing job IDs are waited on if still running and downloaded if already finished." 
+ ) + else: + performance_job_id = "" + devops_job_id = "" + optional_catalog_enabled = False + optional_job_id = "" + metadata_error = "" + + confirm_cols = st.columns([1.0, 1.0, 1.0] if release_mode and optional_catalog_enabled else [1.0, 1.0]) + with confirm_cols[0]: + if release_mode: + st.caption(f"Performance catalog: `{_RELEASE_PERFORMANCE_CATALOG_ID}`") + elif catalog_id: + st.caption(f"Catalog ID: `{catalog_id}`") + with confirm_cols[1]: + if release_mode: + st.caption(f"DevOps catalog: `{_RELEASE_DEVOPS_CATALOG_ID}`") + elif integration_id: + st.caption(f"Integration ID: `{integration_id}`") + if release_mode and optional_catalog_enabled: + with confirm_cols[2]: + st.caption(f"Planning Test catalog: `{_RELEASE_OPTIONAL_CATALOG_ID}`") + if st.session_state.get("workflow_catalog_resolution_error"): + st.warning(f"Could not resolve integration automatically: {st.session_state['workflow_catalog_resolution_error']}") + + if selected_catalog: + desc = str(selected_catalog.get("description") or "").strip() or "Preset selected for quick scheduling." + st.caption(f"Preset: {desc}") + elif selected_server_catalog: + desc = str(selected_server_catalog.get("description") or "").strip() + if desc: + st.caption(f"Fetched catalog: {desc}") + + with st.expander("Advanced options", expanded=False): + adv_cols = st.columns([1.0, 1.0, 0.8, 0.8]) + with adv_cols[0]: + download_type = st.radio( + "Download type", + ["Archives (ZIP)", "Result JSON"], + horizontal=True, + index=0 if default_download_type == "Archives (ZIP)" else 1, + key="workflow_download_type", + disabled=release_mode, + help=( + "Release mode uses archives, but reuses existing downloaded artifacts when the output folders already contain them." 
+ if release_mode + else None + ), + ) + with adv_cols[1]: + environment = st.selectbox( + "Environment", + options=["", "dev", "stg", "prd"], + index=["", "dev", "stg", "prd"].index(default_environment) if default_environment in ("", "dev", "stg", "prd") else 0, + key="workflow_environment", + format_func=lambda value: value or "default", + ) + with adv_cols[2]: + poll_interval = st.slider( + "Poll interval (s)", + min_value=10, + max_value=300, + value=default_poll_interval, + step=10, + key="workflow_poll_interval", + ) + with adv_cols[3]: + max_wait_hours = st.slider( + "Max wait (h)", + min_value=1, + max_value=168, + value=default_max_wait_hours, + key="workflow_max_wait_hours", + ) + + option_cols = st.columns(5) + with option_cols[0]: + run_eval = st.checkbox( + "Run evaluation", + value=False if release_mode else True, + key="workflow_run_eval", + disabled=release_mode, + help="Release PDF generation uses parquet; eval/CSV detail checks can be run separately when needed.", + ) + with option_cols[1]: + generate_parquet = st.checkbox( + "Generate parquet", + value=False if release_mode else CATALOG_IO_AVAILABLE, + disabled=release_mode or not CATALOG_IO_AVAILABLE, + key="workflow_generate_parquet", + help="Release mode generates parquet when missing; existing parquet is enough for PDF generation.", + ) + with option_cols[2]: + skip_large_file = st.checkbox( + "Skip large files", + value=_RELEASE_SKIP_LARGE_FILE if release_mode else default_skip_large_file, + key="workflow_skip_large_file", + disabled=release_mode, + help=( + f"Release mode always skips archives at or above {_RELEASE_LARGE_FILE_MB:g} MB." + if release_mode + else "Skip unusually large archives during download." 
+ ), + ) + with option_cols[3]: + eval_recursive = st.checkbox( + "Recursive scan", + value=False if release_mode else True, + key="workflow_eval_recursive", + disabled=release_mode, + help="Not used in release mode.", + ) + with option_cols[4]: + is_tag = st.checkbox("Target is tag", value=False, key="workflow_is_tag") + + set_config_value("eval_project_id", project_id) + set_config_value("target_name", target_name) + if not release_mode: + set_config_value("eval_download_type", download_type) + set_config_value("eval_phase", phase) + set_config_value("poll_interval", poll_interval) + set_config_value("max_wait_hours", max_wait_hours) + set_config_value("environment", environment) + set_config_value("workflow_description", description) + errors = [] + if not project_id: + errors.append("Project ID") + if not release_mode and not catalog_id: + errors.append("Catalog") + if not release_mode and not integration_id: + errors.append("Integration ID") + if not target_name: + errors.append("Branch or tag") + if release_mode: + if not trend_metadata.get("release_group"): + errors.append("Release group") + if not trend_metadata.get("pilot_auto_version"): + errors.append("Pilot.Auto version") + if not trend_metadata.get("data_count"): + errors.append("Data count") + if not trend_metadata.get("date"): + errors.append("Release date") + if metadata_error: + errors.append(metadata_error) + + resolved_output = None + path_error = "" + if output_path: + resolved_output, path_error = resolve_under_data_root(output_path, allow_missing=True) + if path_error: + errors.append(path_error) + else: + errors.append("Output folder") + + return { + "project_id": project_id, + "environment": environment, + "output_path_default": output_path or _make_default_output_path(target_name), + "download_type_default": download_type, + "phase_default": phase, + "skip_large_file_default": True, + "large_file_mb_default": 50.0, + "keep_zip_files_default": False, + "dialog_payload": { + "errors": errors, 
+ "project_id": project_id, + "catalog_id": catalog_id, + "integration_id": integration_id, + "catalog_preset_name": selected_catalog_name, + "has_custom_catalog": bool(catalog_id and not selected_catalog), + "target_name": target_name, + "description": description, + "resolved_output": str(resolved_output) if resolved_output else "", + "environment": environment, + "is_tag": is_tag, + "download_type": download_type, + "phase": phase, + "poll_interval": int(poll_interval), + "max_wait_hours": int(max_wait_hours), + "run_eval": False if release_mode else bool(run_eval), + "generate_parquet": False if release_mode else bool(generate_parquet), + "skip_large_file": _RELEASE_SKIP_LARGE_FILE if release_mode else bool(skip_large_file), + "eval_recursive": False if release_mode else bool(eval_recursive), + "release_mode": bool(release_mode), + "trend_metadata": trend_metadata if release_mode else {}, + "performance_job_id": performance_job_id if release_mode else "", + "devops_job_id": devops_job_id if release_mode else "", + "optional_catalog_enabled": bool(optional_catalog_enabled) if release_mode else False, + "optional_catalog_id": _RELEASE_OPTIONAL_CATALOG_ID if release_mode and optional_catalog_enabled else "", + "optional_job_id": optional_job_id if release_mode and optional_catalog_enabled else "", + }, + } + + +def _render_workflow_launcher_section( + catalog_presets: List[Dict[str, str]], + catalogs_path: Optional[str], + catalog_load_error: Optional[str], +) -> Dict[str, object]: + section_header("Run Evaluator Workflow", "") + start_defaults = _get_start_workflow_defaults() + if "workflow_start_dialog_open" not in st.session_state: + st.session_state["workflow_start_dialog_open"] = False + new_job_clicked = st.button( + "Start new workflow", + key="workflow_open_start_dialog", + type="primary", + use_container_width=False, + ) + + def _reset_start_workflow_state() -> None: + fresh_target = str(get_config_value("target_name", "beta/v4.3.2") or "beta/v4.3.2") + 
st.session_state["workflow_catalog_name"] = "" + st.session_state["workflow_last_catalog_preset"] = "" + st.session_state["workflow_catalog_id"] = "" + st.session_state["workflow_integration_id"] = "" + st.session_state["workflow_server_catalogs"] = [] + st.session_state["workflow_server_catalog_error"] = "" + st.session_state["workflow_selected_server_catalog_id"] = "" + st.session_state["workflow_selected_server_catalog_label"] = "" + st.session_state["workflow_catalog_resolution_error"] = "" + st.session_state["workflow_last_catalog_selection"] = "" + st.session_state["workflow_release_performance_job_id"] = "" + st.session_state["workflow_release_devops_job_id"] = "" + st.session_state["workflow_release_trend_topic_label"] = "Prediction / object recognition" + st.session_state["workflow_release_custom_trend_topic"] = "" + st.session_state["workflow_output_path"] = _make_default_output_path(fresh_target) + + def _render_start_workflow_controls(*, key_suffix: str = "dialog") -> None: + st.caption("This is the full launcher for creating a new evaluator job, downloading results, and optionally running eval/parquet.") + payload = _render_start_workflow_form(catalog_presets, catalogs_path, catalog_load_error) + submit_cols = st.columns([1.15, 1.15, 3.7]) + close_clicked = submit_cols[0].button( + "Close", + key=f"workflow_close_start_{key_suffix}", + use_container_width=True, + ) + start_clicked = submit_cols[1].button( + "Start workflow", + key=f"workflow_start_btn_{key_suffix}", + type="primary", + use_container_width=True, + ) + if close_clicked: + st.session_state["workflow_start_dialog_open"] = False + st.rerun() + if start_clicked: + dialog_payload = dict(payload.get("dialog_payload") or {}) + errors = dialog_payload.get("errors", []) + if errors: + for err in errors: + st.error(f"Missing or invalid: {err}") + elif not is_task_queue_enabled(): + st.error("Task queue not enabled. 
Set `USE_TASK_QUEUE=true` and `REDIS_URL`.") + else: + common_params = { + "project_id": dialog_payload["project_id"], + "suite_ids": None, + "target_name": dialog_payload["target_name"], + "environment": dialog_payload["environment"], + "max_retries": 0, + "clean_build": False, + "debug": False, + "release": False, + "record_caret": False, + "log_expiration_time_in_days": 14.0, + "is_tag": dialog_payload["is_tag"], + "download_type": "archives" if dialog_payload["download_type"] == "Archives (ZIP)" else "result_json", + "phase": dialog_payload["phase"], + "skip_large_file": bool(dialog_payload.get("skip_large_file", True)), + "large_file_mb": 50.0, + "keep_zip_files": False, + "poll_interval": dialog_payload["poll_interval"], + "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, + "run_eval": dialog_payload["run_eval"], + "generate_parquet": dialog_payload["generate_parquet"], + "eval_recursive": dialog_payload["eval_recursive"], + "eval_overwrite": False, + } + if dialog_payload.get("release_mode"): + base_description = dialog_payload["description"] or _make_auto_release_workflow_description( + dialog_payload["target_name"] + ) + trend_metadata = dict(dialog_payload.get("trend_metadata") or {}) + task_id = _enqueue_task( + "run_release_specsheet_workflow", + { + "project_id": dialog_payload["project_id"], + "target_name": dialog_payload["target_name"], + "description": base_description, + "output_path": dialog_payload["resolved_output"], + "environment": dialog_payload["environment"], + "is_tag": dialog_payload["is_tag"], + "poll_interval": dialog_payload["poll_interval"], + "max_wait_seconds": dialog_payload["max_wait_hours"] * 3600, + "trend_metadata": trend_metadata, + "version": trend_metadata.get("pilot_auto_version", ""), + "topic": trend_metadata.get("topic_name", ""), + "performance_catalog_id": _RELEASE_PERFORMANCE_CATALOG_ID, + "performance_integration_id": _RELEASE_PERFORMANCE_INTEGRATION_ID, + "performance_job_id": 
dialog_payload.get("performance_job_id", ""), + "devops_catalog_id": _RELEASE_DEVOPS_CATALOG_ID, + "devops_integration_id": _RELEASE_DEVOPS_INTEGRATION_ID, + "devops_job_id": dialog_payload.get("devops_job_id", ""), + "optional_catalog_enabled": bool(dialog_payload.get("optional_catalog_enabled", False)), + "optional_catalog_id": dialog_payload.get("optional_catalog_id", ""), + "optional_job_id": dialog_payload.get("optional_job_id", ""), + "analysis_phase": "perception.object_recognition.tracking.objects", + "skip_large_file": _RELEASE_SKIP_LARGE_FILE, + "large_file_mb": _RELEASE_LARGE_FILE_MB, + "run_eval": bool(dialog_payload.get("run_eval", False)), + "overwrite": True, + }, + ) + if task_id: + st.session_state["workflow_start_dialog_open"] = False + st.success(f"Release specsheet workflow queued. Task id: `{task_id}`") + st.rerun() + else: + st.error("Failed to enqueue release specsheet workflow. Check worker logs.") + return + + task_id = _enqueue_task( + "run_evaluator_and_process", + { + **common_params, + "catalog_id": dialog_payload["catalog_id"], + "integration_id": dialog_payload["integration_id"], + "catalog_preset_name": dialog_payload.get("catalog_preset_name", ""), + "description": dialog_payload["description"] or _make_auto_workflow_description( + dialog_payload["target_name"], + dialog_payload.get("catalog_preset_name", ""), + has_custom_catalog=bool(dialog_payload.get("has_custom_catalog", False)), + ), + "output_path": dialog_payload["resolved_output"], + }, + ) + if task_id: + st.session_state["workflow_start_dialog_open"] = False + st.success(f"Workflow queued. Task id: `{task_id}`") + st.rerun() + else: + st.error("Failed to enqueue task. 
Check worker logs.") + + if new_job_clicked: + st.session_state["workflow_start_dialog_open"] = True + _reset_start_workflow_state() + + if st.session_state.get("workflow_start_dialog_open"): + if callable(getattr(st, "dialog", None)): + @st.dialog("Start evaluator workflow", width="large") + def _workflow_start_dialog() -> None: + _render_start_workflow_controls(key_suffix="dialog") + + _workflow_start_dialog() + else: + st.markdown("---") + st.subheader("Start evaluator workflow") + _render_start_workflow_controls(key_suffix="inline") + + return start_defaults + + +_inject_workflow_page_styles() +render_page_hero( + kicker="Workflow automation", + title="Evaluator Workflow", + description="Browse finished runs, watch background tasks, launch fresh evaluator pipelines, and reuse existing evaluator reports from one aligned workspace.", +) + +catalog_presets, catalogs_path, catalog_load_error = _load_catalog_presets() + +tab_tasks, tab_local = st.tabs(["Run Tasks", "Local Runs"]) + +with tab_tasks: + _render_current_tasks_section() + start_defaults = _render_workflow_launcher_section(catalog_presets, catalogs_path, catalog_load_error) + + configure_recent_evaluator_jobs_ui( + get_config_value=get_config_value, + set_config_value=set_config_value, + enqueue_task=_enqueue_task, + catalog_io_available=CATALOG_IO_AVAILABLE, + environment=str(start_defaults["environment"] or ""), + ) + + _render_recent_evaluator_jobs_section( + str(start_defaults["project_id"] or ""), + str(start_defaults["environment"] or ""), + output_path_default=str(start_defaults["output_path_default"]), + download_type_default=str(start_defaults["download_type_default"]), + phase_default=str(start_defaults["phase_default"]), + skip_large_file_default=bool(start_defaults["skip_large_file_default"]), + large_file_mb_default=float(start_defaults["large_file_mb_default"]), + keep_zip_files_default=bool(start_defaults["keep_zip_files_default"]), + show_toggle=False, + default_visible=True, + 
show_title=False, + ) + +with tab_local: + use_fragment = getattr(st, "fragment", None) is not None + if use_fragment: + try: + + @st.fragment + def _local_runs_fragment(): + _render_local_runs_section() + + _local_runs_fragment() + except (TypeError, AttributeError): + _render_local_runs_section() + else: + _render_local_runs_section() diff --git a/evaluation_dashboard_app/pages/7_Data_Management.py b/evaluation_dashboard_app/pages/7_Data_Management.py index 050089d..4b45b86 100644 --- a/evaluation_dashboard_app/pages/7_Data_Management.py +++ b/evaluation_dashboard_app/pages/7_Data_Management.py @@ -5,6 +5,7 @@ import io import re +import urllib.parse import zipfile import streamlit as st from pathlib import Path @@ -79,9 +80,10 @@ key="share_run_b", ) mode = "compare" if share_compare and share_run_b else "single" -q = f"mode={mode}&run_a={share_run_a}" +query = {"mode": mode, "run_a": share_run_a} if mode == "compare": - q += f"&run_b={share_run_b}" + query["run_b"] = share_run_b +q = urllib.parse.urlencode(query) st.code(q, language=None) st.caption("Example: `https://your-server:8501/?` + the query above.") diff --git a/evaluation_dashboard_app/pages/99_Deployment_Debug.py b/evaluation_dashboard_app/pages/99_Deployment_Debug.py index a85b1fb..a46d093 100644 --- a/evaluation_dashboard_app/pages/99_Deployment_Debug.py +++ b/evaluation_dashboard_app/pages/99_Deployment_Debug.py @@ -4,18 +4,23 @@ Must live as a top-level pages/*.py file so st.page_link can resolve it. Outside Docker, the default sidebar entry is hidden via CSS in lib/ui/styles_global.py; Overview shows a page_link only in Docker. 
""" +import json import os -from datetime import timedelta +from datetime import datetime, timedelta +from typing import Any import pandas as pd import streamlit as st +from lib.db import TASK_STATUSES, TASK_TYPES from lib.deploy_debug import ( EXEC_TIMEOUT_SEC, MAX_LOG_TAIL_LINES, compose_project_filter, container_exec_command, container_logs_tail, + database_recent_task_rows, + database_table_overview, docker_client_or_none, is_docker_debug_enabled, is_exec_enabled, @@ -27,7 +32,14 @@ running_in_docker, task_counts_by_status, ) +from lib.docker_live_structure import live_containers_mermaid, rowset_has_t4_compose_service +from lib.mermaid_render import render_mermaid from lib.page_chrome import inject_app_page_styles, render_page_hero, section_header +from lib.t4_visualizer_client import ( + ENV_BASE_URL as T4_ENV_BASE_URL, + T4VisualizerClient, + T4VisualizerError, +) st.set_page_config( layout="wide", @@ -46,19 +58,20 @@ description=( "Check Postgres, Redis, and the RQ queue; inspect redacted environment variables; " "optionally list containers and tail logs when Docker socket access is enabled; " - "optional one-shot shell commands when `EVAL_DEPLOYMENT_DEBUG_EXEC=1`." + "optional one-shot shell commands when `EVAL_DEPLOYMENT_DEBUG_EXEC=1`. " + "The Docker tab’s live diagram includes the T4 dataset server (HTTP 2D/3D rendering) when configured." 
), mode="Single Run", ) -tab_env, tab_dep, tab_tasks, tab_docker = st.tabs( - ["Environment", "Dependencies", "Tasks", "Docker"] +tab_env, tab_dep, tab_tasks, tab_db, tab_docker = st.tabs( + ["Environment", "Dependencies", "Tasks", "Database", "Docker"] ) with tab_env: section_header("Deployment environment", "Sensitive connection strings are redacted.") env_df = pd.DataFrame(redacted_deployment_env_rows(), columns=["Variable", "Value"]) - st.dataframe(env_df, use_container_width=True, hide_index=True) + st.dataframe(env_df, width='stretch', hide_index=True) with tab_dep: section_header("Postgres") @@ -93,20 +106,169 @@ cdf = pd.DataFrame( [{"status": k, "count": v} for k, v in sorted(counts.items())] ) - st.dataframe(cdf, use_container_width=True, hide_index=True) + st.dataframe(cdf, width='stretch', hide_index=True) elif ok_t: st.success("No task rows yet (empty table).") else: st.error(msg_t) +def _debug_json(value: Any) -> str: + try: + return json.dumps(value, ensure_ascii=False, default=str, indent=2) + except (TypeError, ValueError): + return str(value) + + +def _task_rows_dataframe(rows: list) -> pd.DataFrame: + display_rows = [] + for row in rows: + params = row.get("parameters") or {} + if not isinstance(params, dict): + params = {} + display_rows.append( + { + "created_at": row.get("created_at"), + "updated_at": row.get("updated_at"), + "status": row.get("status"), + "type": row.get("type"), + "session_id": row.get("session_id"), + "id": str(row.get("id") or ""), + "rq_job_id": row.get("rq_job_id"), + "job_id": params.get("job_id") + or params.get("performance_job_id") + or params.get("devops_job_id") + or params.get("source_job_id") + or "", + "output_path": params.get("output_path") or params.get("output_dir") or "", + "progress_pct": row.get("progress_pct"), + "progress_message": row.get("progress_message"), + "result_path": row.get("result_path"), + "error_message": row.get("error_message"), + } + ) + return pd.DataFrame(display_rows) + + +def 
_format_progress_metric(value: Any) -> str: + try: + return f"{float(value or 0):g}%" + except (TypeError, ValueError): + return "0%" + + +with tab_db: + section_header( + "Database inspector", + "Read-only view into Postgres tables and recent evaluator/task job history.", + ) + + ok_tables, msg_tables, table_rows = database_table_overview() + if ok_tables and table_rows is not None: + overview_df = pd.DataFrame(table_rows) + if not overview_df.empty: + overview_df["total_mb"] = (overview_df["total_bytes"] / (1024 * 1024)).round(2) + st.dataframe( + overview_df[["table_name", "estimated_rows", "total_mb"]], + width="stretch", + hide_index=True, + ) + else: + st.info("No public tables found.") + elif not ok_tables: + st.error(msg_tables) + + section_header("Recent job history", "Raw `tasks` rows, newest first, across all sessions.") + filters = st.columns([1.2, 1.6, 1.2, 1.2]) + with filters[0]: + status_filter = st.selectbox( + "Status", + ["All", *TASK_STATUSES], + key="deploy_db_status", + ) + with filters[1]: + type_filter = st.selectbox( + "Task type", + ["All", *TASK_TYPES], + key="deploy_db_type", + ) + with filters[2]: + row_limit = st.number_input( + "Rows", + min_value=10, + max_value=500, + value=50, + step=10, + key="deploy_db_limit", + ) + with filters[3]: + page = st.number_input( + "Page", + min_value=1, + max_value=1000, + value=1, + step=1, + key="deploy_db_page", + ) + search = st.text_input( + "Search", + key="deploy_db_search", + placeholder="Task id, job id, session, path, error text, parameters", + ) + + ok_rows, msg_rows, rows, total_rows = database_recent_task_rows( + limit=int(row_limit), + offset=(int(page) - 1) * int(row_limit), + status=None if status_filter == "All" else status_filter, + task_type=None if type_filter == "All" else type_filter, + search=search.strip() or None, + ) + if not ok_rows: + st.error(msg_rows) + elif not rows: + st.info("No task rows matched the current filters.") + else: + st.caption(f"Showing **{len(rows)}** of 
**{total_rows}** matching task rows.") + task_df = _task_rows_dataframe(rows) + st.dataframe(task_df, width="stretch", hide_index=True) + + id_options = [str(row.get("id") or "") for row in rows] + selected_id = st.selectbox("Inspect row", id_options, key="deploy_db_task_inspect") + selected = next((row for row in rows if str(row.get("id") or "") == selected_id), None) + if selected: + meta_cols = st.columns(4) + meta_cols[0].metric("Status", str(selected.get("status") or "—")) + meta_cols[1].metric("Type", str(selected.get("type") or "—")) + meta_cols[2].metric("Progress", _format_progress_metric(selected.get("progress_pct"))) + meta_cols[3].metric("Session", str(selected.get("session_id") or "—")[:32]) + + detail_tabs = st.tabs(["Parameters", "Result summary", "Log", "Raw row"]) + with detail_tabs[0]: + st.code(_debug_json(selected.get("parameters") or {}), language="json") + with detail_tabs[1]: + raw_summary = selected.get("result_summary") + if raw_summary: + try: + parsed = json.loads(raw_summary) if isinstance(raw_summary, str) else raw_summary + st.code(_debug_json(parsed), language="json") + except (TypeError, ValueError): + st.code(str(raw_summary), language=None) + else: + st.info("No result summary stored for this row.") + with detail_tabs[2]: + log_text = (selected.get("log_output") or "").strip() + st.code(log_text or "(empty)", language=None) + with detail_tabs[3]: + st.code(_debug_json(selected), language="json") + + def _render_docker_disabled(reason: str) -> None: st.warning(reason) st.markdown( """ **Enable Docker debug (trusted operators only)** -1. From the `deploy/` directory, ensure `docker-compose.yml` mounts `/var/run/docker.sock` into the `streamlit` service and sets `EVAL_DEPLOYMENT_DEBUG_DOCKER=1`, then run `docker compose up -d` (or `docker compose up -d --force-recreate streamlit` after editing compose). +1. 
From the `deploy/` directory, ensure `docker-compose.yml` mounts `/var/run/docker.sock` into each Streamlit service (`streamlit1`, `streamlit2`) and sets `EVAL_DEPLOYMENT_DEBUG_DOCKER=1`, then run `docker compose up -d` (or recreate those services after editing compose). 2. Set `EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT` in `.env` to your Compose project name (same value as in `docker compose ls`) so the UI lists only this stack’s containers. @@ -158,10 +320,104 @@ def _env_flag(name: str) -> bool: return os.environ.get(name, "").strip().lower() in ("1", "true", "yes") -with tab_docker: - section_header("Containers & logs", "Requires `EVAL_DEPLOYMENT_DEBUG_DOCKER` and `/var/run/docker.sock` in the Streamlit container.") +def _display_columns_for_containers(rows: list) -> pd.DataFrame: + """Column order for the live Docker table (hide internal full_id).""" + df = pd.DataFrame(rows) + if df.empty: + return df + preferred = [ + "name", + "state", + "health", + "compose_service", + "compose_project", + "image", + "id", + ] + cols = [c for c in preferred if c in df.columns] + rest = [c for c in df.columns if c not in cols and c != "full_id"] + return df[cols + rest] + + +def _render_live_stack_mermaid(rows: list) -> None: + """Help-style Mermaid (Clients / Edge / App Tier / T4 / …) with live container labels.""" + if not rows: + return + t4_env = os.environ.get("T4_VISUALIZER_BASE_URL", "").strip() + if t4_env: + st.caption( + "**T4 dataset server** (2D/3D): HTTP API for `/render`, `/viewer/three`, and dataset availability. " + f"`T4_VISUALIZER_BASE_URL` = `{t4_env}`. " + "The diagram shows a matching Compose service if present, otherwise a synthetic node for this URL." + ) + else: + st.caption( + "**T4 dataset server** (optional): used by Bounding Box Viewer and T4 3D Viewer. " + "Set `T4_VISUALIZER_BASE_URL` in `.env` to include it in the diagram (synthetic node). " + "Compose services named `t4_server`, `t4_visualizer`, or `t4_*` are grouped under **T4 dataset server**." 
+ ) + + # Taller when URL is set or a t4_* Compose service is present (extra subgraph). + t4_svc = rowset_has_t4_compose_service(rows) + extra_h = 120 if (t4_env or t4_svc) else 40 + mh = min(920, 280 + 52 * len(rows) + extra_h) + render_mermaid(live_containers_mermaid(rows), height=mh) + + +def _render_t4_remote_probe(base_url: str) -> None: + """Fetch /health and /server/structure.json from the configured T4 visualizer host.""" + base = base_url.rstrip("/") + section_header( + "T4 dataset server (HTTP)", + f"Live probe of `{T4_ENV_BASE_URL}` — same service as Bounding Box / T4 3D pages. " + "Open the links on the T4 host for the server’s own HTML diagram and diagnostics.", + ) + st.markdown( + f"**On the T4 host:** [Structure (HTML)]({base}/server/structure) · " + f"[structure.json]({base}/server/structure.json) · " + f"[Health]({base}/health) · " + f"[Browser diagnostics]({base}/browser/diagnostics)" + ) + try: + client = T4VisualizerClient(base_url=base, timeout=8.0) + health = client.health() + except T4VisualizerError as ex: + st.warning(f"Could not reach T4 server (`GET /health`): {ex}") + return + except OSError as ex: + st.warning(f"Could not reach T4 server: {ex}") + return + + st.caption("GET /health") + st.json(health) + + try: + structure = client.server_structure_json() + except T4VisualizerError as ex: + if ex.status_code == 404: + st.info( + "This T4 server does not expose `/server/structure.json` yet. " + "Upgrade **t4-server** (evaluator_result_parser) or use the links above if the server is older." 
+ ) + else: + st.warning(f"`GET /server/structure.json` failed: {ex}") + return + + mmd = structure.get("mermaid") or "" + if mmd: + st.caption("Internal architecture (returned by t4-server — same diagram as `/server/structure`)") + mh = min(520, 160 + mmd.count("\n") * 26) + render_mermaid(mmd, height=mh) + meta = structure.get("meta") + if isinstance(meta, dict) and meta: + st.caption("Server meta (uptime, caches, diagnostics)") + st.json(meta) + + +with tab_docker: client = docker_client_or_none() + if client is None: if not _env_flag("EVAL_DEPLOYMENT_DEBUG_DOCKER"): _render_docker_disabled( @@ -187,13 +443,6 @@ def _env_flag(name: str) -> bool: ) else: proj = compose_project_filter() - if proj: - st.caption(f"Filtering by Compose project label: `{proj}`") - else: - st.warning( - "Listing all containers on this Docker host. Set `EVAL_DEPLOYMENT_DEBUG_COMPOSE_PROJECT` in `.env` " - "to match `docker compose ls` and restrict the list." - ) _use_fragment = getattr(st, "fragment", None) is not None @@ -202,16 +451,24 @@ def _env_flag(name: str) -> bool: @st.fragment(run_every=timedelta(seconds=6)) def _docker_fragment(): rows, list_warn = list_containers_for_debug(client) + st.caption(f"Last refreshed (server clock): **{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}** — updates about every 6 s.") if list_warn and isinstance(list_warn, str) and list_warn.startswith("Docker list failed"): st.error(list_warn) return if list_warn: st.markdown(list_warn) + t4_probe_url = os.environ.get("T4_VISUALIZER_BASE_URL", "").strip() if not rows: st.info("No containers match the current filter.") + if t4_probe_url: + _render_t4_remote_probe(t4_probe_url) return - display_df = pd.DataFrame(rows).drop(columns=["full_id"], errors="ignore") - st.dataframe(display_df, use_container_width=True, hide_index=True) + section_header("Live container table", "Sortable columns; `full_id` stays internal for log/exec.") + display_df = _display_columns_for_containers(rows) + st.dataframe(display_df, 
width='stretch', hide_index=True) + _render_live_stack_mermaid(rows) + if t4_probe_url: + _render_t4_remote_probe(t4_probe_url) options = [f"{r['name']} ({r['id']})" for r in rows] id_by_label = {f"{r['name']} ({r['id']})": r["full_id"] for r in rows} @@ -228,6 +485,7 @@ def _docker_fragment(): full_id = id_by_label[pick] st.session_state.deploy_debug_cid = full_id + section_header("Logs", "Stdout/stderr from the selected container.") tail = st.slider( "Log tail (lines)", min_value=50, @@ -237,29 +495,33 @@ def _docker_fragment(): key="deploy_debug_tail", ) logs = container_logs_tail(client, full_id, tail) - st.markdown("**Logs**") st.code(logs or "(empty)", language=None) _render_docker_exec_ui(client, full_id) _docker_fragment() else: rows, list_warn = list_containers_for_debug(client) + st.caption(f"Loaded at **{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}** — use Refresh to re-query.") if list_warn and isinstance(list_warn, str) and list_warn.startswith("Docker list failed"): st.error(list_warn) elif list_warn: st.markdown(list_warn) + t4_probe_url = os.environ.get("T4_VISUALIZER_BASE_URL", "").strip() if not rows: st.info("No containers match the current filter.") + if t4_probe_url: + _render_t4_remote_probe(t4_probe_url) else: - df = pd.DataFrame(rows) - st.dataframe( - df.drop(columns=["full_id"], errors="ignore"), - use_container_width=True, - hide_index=True, - ) + _render_live_stack_mermaid(rows) + if t4_probe_url: + _render_t4_remote_probe(t4_probe_url) + section_header("Live container table", "Sortable columns; `full_id` stays internal for log/exec.") + display_df = _display_columns_for_containers(rows) + st.dataframe(display_df, width='stretch', hide_index=True) options = [f"{r['name']} ({r['id']})" for r in rows] id_by_label = {f"{r['name']} ({r['id']})": r["full_id"] for r in rows} pick = st.selectbox("Container", options=options, key="deploy_debug_pick_legacy") + section_header("Logs", "Stdout/stderr from the selected container.") tail = st.slider( 
"Log tail (lines)", min_value=50, @@ -270,8 +532,9 @@ def _docker_fragment(): ) full_id_legacy = id_by_label[pick] logs = container_logs_tail(client, full_id_legacy, tail) - st.markdown("**Logs**") st.code(logs or "(empty)", language=None) _render_docker_exec_ui(client, full_id_legacy) if st.button("Refresh container list"): st.rerun() + + st.page_link("pages/10_Help.py", label="Help & guide (full README, including static stack Mermaid)", icon="❔") diff --git a/evaluation_dashboard_app/pages/9_TLR_Analysis.py b/evaluation_dashboard_app/pages/9_TLR_Analysis.py index 278a956..6f9519b 100644 --- a/evaluation_dashboard_app/pages/9_TLR_Analysis.py +++ b/evaluation_dashboard_app/pages/9_TLR_Analysis.py @@ -5,7 +5,11 @@ Supports shareable URLs via query params: mode, path_a, path_b. """ +import json +import html +import os import streamlit as st +import streamlit.components.v1 as components import pandas as pd import plotly.express as px import plotly.graph_objects as go @@ -14,6 +18,7 @@ from lib.tlr_eval_analyzer import TLREvaluationAnalyzer from lib.path_utils import get_data_root, path_display, list_tlr_result_directories +from lib.t4_visualizer_client import DEFAULT_BASE_URL, ENV_BASE_URL from lib.page_chrome import ( inject_app_page_styles, render_loaded_data_section, @@ -62,7 +67,7 @@ def get_or_load_analyzer(resolved_path: str): """Load analyzer for path; cache in session_state by path.""" if not resolved_path: return None - cache_key = "tlr_analyzer_cache" + cache_key = "tlr_analyzer_cache_v2" if cache_key not in st.session_state: st.session_state[cache_key] = {} cache = st.session_state[cache_key] @@ -78,7 +83,192 @@ def get_or_load_analyzer(resolved_path: str): return cache[resolved_path] -def _render_single_tabs(analyzer, tab_criteria, tab_vehicle, tab_critical, tab_details): +def _dataframe_to_json_bytes(df: pd.DataFrame, export_kind: str) -> bytes: + """Serialize a DataFrame to a stable JSON payload for downstream viewers.""" + payload = { + 
"format_version": 1, + "export_kind": export_kind, + "columns": df.columns.tolist(), + "records": df.to_dict(orient="records"), + } + return json.dumps(payload, ensure_ascii=False, indent=2, default=str).encode("utf-8") + + +def _build_tlr_eval_payload_by_frame(df: pd.DataFrame | None) -> dict: + """Build per-frame TLR evaluation payload for the embedded viewer.""" + if df is None or df.empty or "frame_index" not in df.columns: + return {"type": "tlr_eval_clear"} + + frames: dict[str, dict] = {} + ordered = df.sort_values(["frame_index", "scenario"]).reset_index(drop=True) + for _, row in ordered.iterrows(): + try: + frame_key = str(int(row.get("frame_index", 0))) + except (TypeError, ValueError): + continue + if frame_key in frames: + continue + + def _float_or_none(value): + try: + return None if pd.isna(value) else float(value) + except Exception: + return None + + def _string_or_none(value): + try: + if pd.isna(value) or value == "": + return None + except Exception: + pass + return str(value) + + frames[frame_key] = { + "scenario": str(row.get("scenario", "") or ""), + "t4dataset_id": str(row.get("t4dataset_id", "") or ""), + "frame_name": str(row.get("frame_name", "") or ""), + "status": str(row.get("status", "") or ""), + "speed_kph": _float_or_none(row.get("speed_kph")), + "yaw_rate_deg_s": _float_or_none(row.get("yaw_rate_deg_s")), + "current_time": _float_or_none(row.get("current_time")), + "current_time_us": ( + int(round(float(row.get("current_time")) * 1_000_000)) + if _float_or_none(row.get("current_time")) not in (None, 0.0) + else None + ), + "traffic_light_type": str(row.get("traffic_light_type", "") or ""), + "evaluation_result": str(row.get("traffic_light_type", "") or ""), + "criteria": str(row.get("criteria", "") or ""), + "tp": _string_or_none(row.get("tp")), + "fp": _string_or_none(row.get("fp")), + "fn": _string_or_none(row.get("fn")), + "tn": _string_or_none(row.get("tn")), + } + return {"type": "tlr_eval_by_frame", "frames": frames} + + 
+def _render_tlr_viewer_embed(viewer_url: str, payload: dict, *, iframe_id: str, height: int = 1400) -> None: + """Embed `/viewer/tlr` and post a frame-indexed evaluation payload into the iframe.""" + payload_json = json.dumps(payload, ensure_ascii=True) + payload_hex = payload_json.encode("utf-8").hex() + iframe_src = html.escape(viewer_url, quote=True) + components.html( + ( + f'' + "" + ), + height=height + 8, + scrolling=False, + ) + + +def _render_tlr_viewer_tab(detail_sources: dict[str, pd.DataFrame | None], *, key_prefix: str) -> None: + st.subheader("Embedded traffic light viewer") + st.caption("Pick a dataset from the current TLR details, then load the external `/viewer/tlr` page inline.") + + if f"{key_prefix}_base_url" not in st.session_state: + st.session_state[f"{key_prefix}_base_url"] = ( + (os.environ.get(ENV_BASE_URL) or DEFAULT_BASE_URL).strip() or DEFAULT_BASE_URL + ) + + base_url = st.text_input( + "T4 server base URL", + key=f"{key_prefix}_base_url", + help=f"Default from env `{ENV_BASE_URL}`. 
The viewer URL is `/viewer/tlr?t4dataset_id=...&frame_index=...`.", + ) + + available_labels = [label for label, df in detail_sources.items() if df is not None and not df.empty] + if not available_labels: + st.info("No TLR detail rows available to drive the viewer.") + return + + source_label = available_labels[0] + if len(available_labels) > 1: + source_label = st.radio( + "Use rows from", + available_labels, + horizontal=True, + key=f"{key_prefix}_source_label", + ) + + details_df = detail_sources[source_label].copy() + details_df = details_df[details_df["t4dataset_id"].fillna("").astype(str) != ""].copy() + if details_df.empty: + st.info("The selected rows do not contain any `t4dataset_id` values.") + return + + dataset_options = sorted(details_df["t4dataset_id"].astype(str).unique().tolist()) + selected_dataset = st.selectbox( + "Candidate t4dataset_id", + dataset_options, + key=f"{key_prefix}_dataset_id", + ) + dataset_rows = details_df[details_df["t4dataset_id"].astype(str) == selected_dataset].copy() + + if dataset_rows.empty: + st.info("No rows match the current dataset selection.") + return + + dataset_rows = dataset_rows.sort_values(["scenario", "frame_index"]).reset_index(drop=True) + selected_row = dataset_rows.iloc[0] + selected_frame = int(selected_row["frame_index"]) + payload = _build_tlr_eval_payload_by_frame(dataset_rows) + + viewer_url = f"{base_url.rstrip('/')}/viewer/tlr?t4dataset_id={quote(selected_dataset, safe='')}&frame_index={selected_frame}" + st.markdown(f"[Open `/viewer/tlr` in new tab]({viewer_url})") + st.caption( + f"Using the first available frame for this dataset: `frame_index={selected_frame}` from `{selected_row['scenario']}`." 
+ ) + + preview_cols = ["scenario", "frame_index", "status", "traffic_light_type", "criteria"] + if "frame_name" in dataset_rows.columns: + preview_cols.insert(2, "frame_name") + with st.expander("Matching rows", expanded=False): + st.dataframe(dataset_rows[preview_cols].sort_values(["scenario", "frame_index"]), width="stretch", hide_index=True) + + _render_tlr_viewer_embed(viewer_url, payload, iframe_id=f"{key_prefix}_iframe", height=1600) + + +def _render_single_tabs(analyzer, tab_criteria, tab_vehicle, tab_critical, tab_details, tab_tlr_viewer): with tab_criteria: st.subheader("Criteria: TP rate and total frames") criteria_df = analyzer.create_criteria_matrix() @@ -136,12 +326,98 @@ def _render_single_tabs(analyzer, tab_criteria, tab_vehicle, tab_critical, tab_d details_df = analyzer.get_vehicle_status_details_df() if details_df is not None and not details_df.empty: st.caption("One row per frame. Use filters to narrow down by scenario, status, or traffic light type.") - st.dataframe(details_df, width='stretch', hide_index=True) + filtered_details = details_df.copy() + all_scenarios = sorted(filtered_details["scenario"].dropna().astype(str).unique().tolist()) + all_statuses = sorted(filtered_details["status"].dropna().astype(str).unique().tolist()) + all_tlr_types = sorted(filtered_details["traffic_light_type"].dropna().astype(str).unique().tolist()) + + with st.expander("Filters & sort", expanded=False): + f1, f2, f3 = st.columns(3) + with f1: + sel_scenarios = st.multiselect( + "Scenario(s)", + options=all_scenarios, + default=[], + key="tlr_single_tab_filter_scenario", + help="Leave empty to show all scenarios.", + ) + with f2: + sel_statuses = st.multiselect( + "Vehicle status", + options=all_statuses, + default=[], + key="tlr_single_tab_filter_status", + help="Leave empty to show all statuses.", + ) + with f3: + sel_tlr_types = st.multiselect( + "Traffic light type", + options=all_tlr_types, + default=[], + key="tlr_single_tab_filter_tlr_type", + help="Leave 
empty to show all traffic light types.", + ) + sort_by = st.selectbox( + "Sort by", + [ + "Scenario, then frame index", + "Frame index only", + "Vehicle status, then scenario, frame index", + "Traffic light type, then scenario, frame index", + ], + key="tlr_single_tab_sort_by", + ) + + if sel_scenarios: + filtered_details = filtered_details[filtered_details["scenario"].astype(str).isin(sel_scenarios)] + if sel_statuses: + filtered_details = filtered_details[filtered_details["status"].astype(str).isin(sel_statuses)] + if sel_tlr_types: + filtered_details = filtered_details[ + filtered_details["traffic_light_type"].astype(str).isin(sel_tlr_types) + ] + + if sort_by == "Scenario, then frame index": + filtered_details = filtered_details.sort_values(["scenario", "frame_index"]).reset_index(drop=True) + elif sort_by == "Frame index only": + filtered_details = filtered_details.sort_values(["frame_index", "scenario"]).reset_index(drop=True) + elif sort_by == "Vehicle status, then scenario, frame index": + filtered_details = filtered_details.sort_values(["status", "scenario", "frame_index"]).reset_index(drop=True) + else: + filtered_details = filtered_details.sort_values( + ["traffic_light_type", "scenario", "frame_index"] + ).reset_index(drop=True) + + st.dataframe(filtered_details, width='stretch', hide_index=True) + caption = f"Showing **{len(filtered_details)}** frame(s). Total before filters: {len(details_df)}." + if sel_scenarios or sel_statuses or sel_tlr_types: + caption += " Filters applied." 
+ st.caption(caption) + dl_col_csv, dl_col_json = st.columns(2) + with dl_col_csv: + st.download_button( + "Download as CSV", + data=filtered_details.to_csv(index=False).encode("utf-8"), + file_name="tlr_details.csv", + mime="text/csv", + key="tlr_dl_single_tab_csv", + ) + with dl_col_json: + st.download_button( + "Download as JSON", + data=_dataframe_to_json_bytes(filtered_details, export_kind="single_dataset_details"), + file_name="tlr_details.json", + mime="application/json", + key="tlr_dl_single_tab_json", + ) else: st.info("No vehicle status details available.") + with tab_tlr_viewer: + _render_tlr_viewer_tab({"Current run": analyzer.get_vehicle_status_details_df()}, key_prefix="tlr_single_viewer") + -def _render_compare_tabs(analyzer_a, analyzer_b, label_a, label_b, tab_criteria, tab_vehicle, tab_critical, tab_details): +def _render_compare_tabs(analyzer_a, analyzer_b, label_a, label_b, tab_criteria, tab_vehicle, tab_critical, tab_details, tab_tlr_viewer): with tab_criteria: st.subheader("Criteria: A vs B (TP rate and delta)") df_a = analyzer_a.create_criteria_matrix() @@ -244,11 +520,27 @@ def _render_compare_tabs(analyzer_a, analyzer_b, label_a, label_b, tab_criteria, details_b = analyzer_b.get_vehicle_status_details_df() if details_a is not None and not details_a.empty and details_b is not None and not details_b.empty: merge_keys = ["scenario", "frame_index"] - a_sub = details_a[merge_keys + ["frame_name", "status", "traffic_light_type"]].copy() - a_sub = a_sub.rename(columns={"frame_name": "frame_name_a", "status": "status_a", "traffic_light_type": f"traffic_light_type ({label_a})"}) - b_sub = details_b[merge_keys + ["frame_name", "status", "traffic_light_type"]].copy() - b_sub = b_sub.rename(columns={"frame_name": "frame_name_b", "status": "status_b", "traffic_light_type": f"traffic_light_type ({label_b})"}) + a_sub = details_a[merge_keys + ["t4dataset_id", "frame_name", "status", "traffic_light_type"]].copy() + a_sub = a_sub.rename( + columns={ + 
"t4dataset_id": f"t4dataset_id ({label_a})", + "frame_name": "frame_name_a", + "status": "status_a", + "traffic_light_type": f"traffic_light_type ({label_a})", + } + ) + b_sub = details_b[merge_keys + ["t4dataset_id", "frame_name", "status", "traffic_light_type"]].copy() + b_sub = b_sub.rename( + columns={ + "t4dataset_id": f"t4dataset_id ({label_b})", + "frame_name": "frame_name_b", + "status": "status_b", + "traffic_light_type": f"traffic_light_type ({label_b})", + } + ) merged = a_sub.merge(b_sub, on=merge_keys, how="inner") + dataset_col_a = f"t4dataset_id ({label_a})" + dataset_col_b = f"t4dataset_id ({label_b})" tlr_col_a = f"traffic_light_type ({label_a})" tlr_col_b = f"traffic_light_type ({label_b})" merged["_diff"] = merged[tlr_col_a] != merged[tlr_col_b] @@ -356,7 +648,7 @@ def _render_compare_tabs(analyzer_a, analyzer_b, label_a, label_b, tab_criteria, if not to_show_diff.empty: st.markdown("**Frames where traffic light type differs (A vs B)**") display_df = to_show_diff[[ - "scenario", "frame_index", + "scenario", dataset_col_a, dataset_col_b, "frame_index", tlr_col_a, tlr_col_b, "status_a", "status_b", ]].copy() @@ -373,7 +665,12 @@ def _highlight_diff_columns(series): st.caption(caption) # Download CSV csv_bytes = display_df.to_csv(index=False).encode("utf-8") - st.download_button("Download as CSV", data=csv_bytes, file_name="tlr_diff_frames.csv", mime="text/csv", key="tlr_dl_diff") + json_bytes = _dataframe_to_json_bytes(display_df, export_kind="compare_diff_frames") + dl_col_csv, dl_col_json = st.columns(2) + with dl_col_csv: + st.download_button("Download as CSV", data=csv_bytes, file_name="tlr_diff_frames.csv", mime="text/csv", key="tlr_dl_diff") + with dl_col_json: + st.download_button("Download as JSON", data=json_bytes, file_name="tlr_diff_frames.json", mime="application/json", key="tlr_dl_diff_json") else: st.info( f"No frames with different traffic light type between {label_a} and {label_b}" @@ -382,7 +679,7 @@ def 
_highlight_diff_columns(series): else: st.markdown("**All frames (A vs B)** — rows where traffic light type differs are highlighted.") display_df = to_show_merged[[ - "scenario", "frame_index", + "scenario", dataset_col_a, dataset_col_b, "frame_index", tlr_col_a, tlr_col_b, "status_a", "status_b", ]].copy() @@ -402,7 +699,12 @@ def _highlight_diff_rows(df): caption += " Filters applied." st.caption(caption) csv_bytes = display_df.to_csv(index=False).encode("utf-8") - st.download_button("Download as CSV", data=csv_bytes, file_name="tlr_compare_all_frames.csv", mime="text/csv", key="tlr_dl_all") + json_bytes = _dataframe_to_json_bytes(display_df, export_kind="compare_all_frames") + dl_col_csv, dl_col_json = st.columns(2) + with dl_col_csv: + st.download_button("Download as CSV", data=csv_bytes, file_name="tlr_compare_all_frames.csv", mime="text/csv", key="tlr_dl_all") + with dl_col_json: + st.download_button("Download as JSON", data=json_bytes, file_name="tlr_compare_all_frames.json", mime="application/json", key="tlr_dl_all_json") else: st.caption("Need details from both A and B to show traffic light type differences.") st.markdown("---") @@ -425,16 +727,37 @@ def _highlight_diff_rows(df): details_df = details_df[details_df["scenario"].isin(single_sel)] st.dataframe(details_df, width='stretch', hide_index=True) if not details_df.empty: - st.download_button( - "Download as CSV", - data=details_df.to_csv(index=False).encode("utf-8"), - file_name=f"tlr_details_{view_which.replace(' ', '_')}.csv", - mime="text/csv", - key="tlr_dl_single", - ) + csv_name = f"tlr_details_{view_which.replace(' ', '_')}.csv" + json_name = f"tlr_details_{view_which.replace(' ', '_')}.json" + dl_col_csv, dl_col_json = st.columns(2) + with dl_col_csv: + st.download_button( + "Download as CSV", + data=details_df.to_csv(index=False).encode("utf-8"), + file_name=csv_name, + mime="text/csv", + key="tlr_dl_single", + ) + with dl_col_json: + st.download_button( + "Download as JSON", + 
data=_dataframe_to_json_bytes(details_df, export_kind="per_dataset_details"), + file_name=json_name, + mime="application/json", + key="tlr_dl_single_json", + ) else: st.info("No vehicle status details available.") + with tab_tlr_viewer: + _render_tlr_viewer_tab( + { + label_a: analyzer_a.get_vehicle_status_details_df(), + label_b: analyzer_b.get_vehicle_status_details_df(), + }, + key_prefix="tlr_compare_viewer", + ) + # ----- Sidebar: mode and TLR directory selection ----- st.sidebar.markdown("##### TLR data") @@ -568,10 +891,10 @@ def _highlight_diff_rows(df): f"Worst: **{stats['worst_criteria']}** (TP rate {stats['worst_tp_rate']:.2%})" ) - tab_criteria, tab_vehicle, tab_critical, tab_details = st.tabs([ - "Criteria matrix", "Vehicle status vs TLR type", "Critical & priority zones", "Vehicle status details", + tab_criteria, tab_vehicle, tab_critical, tab_details, tab_tlr_viewer = st.tabs([ + "Criteria matrix", "Vehicle status vs TLR type", "Critical & priority zones", "Vehicle status details", "TLR viewer", ]) - _render_single_tabs(analyzer_a, tab_criteria, tab_vehicle, tab_critical, tab_details) + _render_single_tabs(analyzer_a, tab_criteria, tab_vehicle, tab_critical, tab_details, tab_tlr_viewer) st.stop() # ========== COMPARE MODE ========== @@ -606,7 +929,7 @@ def _highlight_diff_rows(df): st.metric("Total TP", f"{stats_b['total_tp']:,}") st.metric("Overall TP rate", f"{stats_b['overall_tp_rate']:.2%}") -tab_criteria, tab_vehicle, tab_critical, tab_details = st.tabs([ - "Criteria matrix", "Vehicle status vs TLR type", "Critical & priority zones", "Vehicle status details", +tab_criteria, tab_vehicle, tab_critical, tab_details, tab_tlr_viewer = st.tabs([ + "Criteria matrix", "Vehicle status vs TLR type", "Critical & priority zones", "Vehicle status details", "TLR viewer", ]) -_render_compare_tabs(analyzer_a, analyzer_b, label_a, label_b, tab_criteria, tab_vehicle, tab_critical, tab_details) +_render_compare_tabs(analyzer_a, analyzer_b, label_a, label_b, 
tab_criteria, tab_vehicle, tab_critical, tab_details, tab_tlr_viewer) diff --git a/evaluation_dashboard_app/requirements-docker.txt b/evaluation_dashboard_app/requirements-docker.txt old mode 100644 new mode 100755 index aeb7130..7486fc0 --- a/evaluation_dashboard_app/requirements-docker.txt +++ b/evaluation_dashboard_app/requirements-docker.txt @@ -4,12 +4,15 @@ streamlit>=1.30.0 pandas>=2.0.0 plotly>=5.18.0 +kaleido>=0.2.1 duckdb>=0.9.0 numpy>=1.24.0 matplotlib>=3.7.0 shapely>=2.0.0 +polars>=1.0.0 requests>=2.31.0 PyYAML>=6.0 +reportlab>=4.0.0 rq>=1.15.0 psycopg2-binary>=2.9.0 -docker>=7.0.0,<8 \ No newline at end of file +docker>=7.0.0,<8 diff --git a/evaluation_dashboard_app/requirements.txt b/evaluation_dashboard_app/requirements.txt old mode 100644 new mode 100755 index 6bbc2f5..74726e8 --- a/evaluation_dashboard_app/requirements.txt +++ b/evaluation_dashboard_app/requirements.txt @@ -1,12 +1,14 @@ streamlit>=1.30.0 pandas>=2.0.0 plotly>=5.18.0 +kaleido>=0.2.1 duckdb>=0.9.0 numpy>=1.24.0 matplotlib>=3.7.0 shapely>=2.0.0 requests>=2.31.0 PyYAML>=6.0 +reportlab>=4.0.0 # Production: task queue and metadata rq>=1.15.0 psycopg2-binary>=2.9.0 diff --git a/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py b/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py new file mode 100644 index 0000000..11d26e9 --- /dev/null +++ b/evaluation_dashboard_app/scripts/import_catalog_analyzer_releases.py @@ -0,0 +1,540 @@ +#!/usr/bin/env python3 +"""Import perception_catalog_analyzer release exports into dashboard data. + +This script converts release data generated directly by +perception_catalog_analyzer into the dashboard's release/trend structure. 
+ +Expected source layout: + + perception_catalog_analyzer_output/ + export/ + / + metadata.yaml + current.parquet + future.parquet + devops.parquet + detection.yaml + pdf/ + / + / + / + metadata.yaml + summary.json + specsheet/ + specsheet.pdf + +Here is usually a joined list of evaluator job IDs, for example: + + __ + +Generated dashboard layout: + + data/ + release_spec_/ + metadata.yaml + performance/ + metadata.yaml + resources/summary.json + current.parquet + future.parquet + detection.yaml + usecase/ + metadata.yaml + resources/summary.json + devops/ + metadata.yaml + resources/summary.json + current.parquet + specsheet/ + specsheet.pdf + /specsheet.pdf + + trend_release_/ + / + / + metadata.yaml + summary.json + specsheet/specsheet.pdf + + static/ + release_specs/ + / + .pdf + +By default, large artifacts such as parquet/PDF/HTML/PNG are symlinked to avoid +duplicating very large analyzer output. Use --copy-large-artifacts when the +original analyzer output may be removed or unavailable from the server. + +Common usage: + + cd /path/to/evaluation_dashboard_app + python scripts/import_catalog_analyzer_releases.py \\ + --source /path/to/perception_catalog_analyzer_output \\ + --data-root /path/to/dashboard/data \\ + --force + +Production/server usage when source data should not remain mounted: + + python scripts/import_catalog_analyzer_releases.py \\ + --source /mnt/catalog_analyzer_output \\ + --data-root /srv/eval_dashboard/data \\ + --copy-large-artifacts \\ + --force + +After import, make sure the app serves static PDFs from static/. In this app's +Docker setup, static/ is mounted into /app/static and Streamlit static serving +is enabled. + +If the app directory is read-only on a server, either: + + - pass --static-root /writable/path/release_specs and mount that path as + /app/static/release_specs, or + - pass --skip-static-publish to import data only. PDF files are still copied + into data/release_spec_*/specsheet and data/trend_release_*/specsheet. 
+""" + +from __future__ import annotations + +import argparse +import json +import os +import shutil +import sys +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +import yaml + + +ANALYZER_ROOT = Path("/home/leigu/Downloads/perception_catalog_analyzer_output") +MAIN_TOPIC = "perception.object_recognition.objects" +ROLE_DIR_BY_SUMMARY_ROLE = { + "full": "performance", + "usecase": "usecase", + "devops": "devops", + "performance_blocks": "performance", + "unknown": "unknown", +} +DEFAULT_PROJECT_ID = "x2_dev" +SUMMARY_FULL_HEADER = "全数データセット評価" +SUMMARY_USECASE_HEADER = "ユースケース評価" +LARGE_SUFFIXES = {".parquet", ".html", ".png"} + + +@dataclass(frozen=True) +class ImportStats: + releases: int = 0 + trend_jobs: int = 0 + role_runs: int = 0 + linked: int = 0 + copied: int = 0 + skipped: int = 0 + + def add(self, **kwargs: int) -> "ImportStats": + values = self.__dict__.copy() + for key, value in kwargs.items(): + values[key] = int(values.get(key, 0)) + value + return ImportStats(**values) + + +def _data_root() -> Path: + raw = os.environ.get("EVAL_DASHBOARD_DATA_ROOT", "data") + root = Path(raw) + if not root.is_absolute(): + root = Path.cwd() / root + root.mkdir(parents=True, exist_ok=True) + return root.resolve() + + +def _safe_path_part(value: str, fallback: str) -> str: + import re + + text = re.sub(r"[^\w.\-]+", "_", str(value or "")).strip("._") + return text or fallback + + +def _load_json(path: Path) -> dict[str, Any]: + with path.open("r", encoding="utf-8") as fh: + data = json.load(fh) + return data if isinstance(data, dict) else {} + + +def _load_yaml(path: Path) -> dict[str, Any]: + if not path.exists(): + return {} + with path.open("r", encoding="utf-8") as fh: + data = yaml.safe_load(fh) or {} + return data if isinstance(data, dict) else {} + + +def _classify_summary(summary: dict[str, Any]) -> str: + blocks = summary.get("blocks") + if isinstance(blocks, list): + headers = [str(block.get("header") or "") for block in 
blocks if isinstance(block, dict)] + if SUMMARY_FULL_HEADER in headers: + return "full" + if SUMMARY_USECASE_HEADER in headers: + return "usecase" + return "performance_blocks" + if summary: + return "devops" + return "unknown" + + +def _write_yaml(path: Path, payload: dict[str, Any]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as fh: + yaml.safe_dump(payload, fh, allow_unicode=True, sort_keys=False) + + +def _copy_or_link(src: Path, dst: Path, *, copy_large_artifacts: bool, force: bool) -> str: + dst.parent.mkdir(parents=True, exist_ok=True) + if dst.exists() or dst.is_symlink(): + if not force: + return "skipped" + if dst.is_dir() and not dst.is_symlink(): + shutil.rmtree(dst) + else: + dst.unlink() + + should_link = src.suffix.lower() in LARGE_SUFFIXES and not copy_large_artifacts + if should_link: + os.symlink(src.resolve(), dst) + return "linked" + shutil.copy2(src, dst) + return "copied" + + +def _publish_static_pdf(pdf_path: Path, static_pdf_path: Path, *, force: bool) -> str: + static_pdf_path.parent.mkdir(parents=True, exist_ok=True) + if static_pdf_path.exists() or static_pdf_path.is_symlink(): + if not force: + return "skipped" + static_pdf_path.unlink() + source = pdf_path.resolve() + try: + os.link(source, static_pdf_path) + except OSError: + shutil.copy2(source, static_pdf_path) + return "copied" + + +def _artifact_stat(stats: ImportStats, action: str) -> ImportStats: + if action == "linked": + return stats.add(linked=1) + if action == "copied": + return stats.add(copied=1) + if action == "skipped": + return stats.add(skipped=1) + return stats + + +def _merge_metadata(base: dict[str, Any], *, group_name: str, topic_name: str, job_id: str, role: str) -> dict[str, Any]: + evaluator_info = base.get("evaluator_info") if isinstance(base.get("evaluator_info"), dict) else {} + catalog = evaluator_info.get("catalog") if isinstance(evaluator_info.get("catalog"), dict) else {} + source = 
evaluator_info.get("event", {}).get("source", {}) if isinstance(evaluator_info.get("event"), dict) else {} + project_id = str(base.get("project_id") or DEFAULT_PROJECT_ID).strip() + merged = { + key: base.get(key) + for key in ( + "tags", + "pilot_auto_version", + "version_abbr", + "data_count", + "description", + "date", + ) + if base.get(key) not in (None, "") + } + if catalog: + merged["catalog_display_name"] = catalog.get("display_name") + merged["catalog_id"] = catalog.get("id") + merged["catalog_version_id"] = catalog.get("version_id") + if isinstance(source, dict): + for key in ("git_commit_url", "git_ref", "git_commit_date"): + if source.get(key): + merged[key] = source.get(key) + merged["release_group"] = group_name + merged["topic_name"] = topic_name + merged["job_id"] = job_id + merged["project_id"] = project_id + merged["role"] = role + merged["imported_from"] = str(ANALYZER_ROOT) + return merged + + +def _copy_export_job( + export_root: Path, + job_id: str, + target_dir: Path, + *, + group_name: str, + topic_name: str, + role: str, + copy_large_artifacts: bool, + force: bool, + stats: ImportStats, +) -> ImportStats: + source_dir = export_root / job_id + if not source_dir.is_dir(): + return stats + + source_metadata = _load_yaml(source_dir / "metadata.yaml") + metadata = _merge_metadata( + source_metadata, + group_name=group_name, + topic_name=topic_name, + job_id=job_id, + role=role, + ) + _write_yaml(target_dir / "metadata.yaml", metadata) + stats = stats.add(copied=1) + + for file_name in ("current.parquet", "future.parquet", "devops.parquet", "detection.yaml"): + src = source_dir / file_name + if not src.exists(): + continue + action = _copy_or_link(src, target_dir / file_name, copy_large_artifacts=copy_large_artifacts, force=force) + stats = _artifact_stat(stats, action) + return stats + + +def _copy_summary_job( + job_dir: Path, + target_dir: Path, + *, + group_name: str, + topic_name: str, + job_id: str, + role: str, + force: bool, + stats: 
ImportStats, +) -> ImportStats: + resources = target_dir / "resources" + resources.mkdir(parents=True, exist_ok=True) + metadata = _load_yaml(job_dir / "metadata.yaml") + metadata = _merge_metadata(metadata, group_name=group_name, topic_name=topic_name, job_id=job_id, role=role) + _write_yaml(resources / "metadata.yaml", metadata) + stats = stats.add(copied=1) + + for src, dst in ( + (job_dir / "summary.json", resources / "summary.json"), + (job_dir / "summary.json", target_dir / "summary.json"), + ): + if src.exists(): + action = _copy_or_link(src, dst, copy_large_artifacts=True, force=force) + stats = _artifact_stat(stats, action) + return stats + + +def import_releases( + analyzer_root: Path, + data_root: Path, + *, + static_root: Path | None, + copy_large_artifacts: bool, + force: bool, +) -> ImportStats: + export_root = analyzer_root / "export" + pdf_root = analyzer_root / "pdf" + stats = ImportStats() + + if not export_root.is_dir() or not pdf_root.is_dir(): + raise FileNotFoundError(f"Expected export/ and pdf/ under {analyzer_root}") + + if static_root is not None: + try: + static_root.mkdir(parents=True, exist_ok=True) + except PermissionError as exc: + print( + f"Warning: cannot write static PDF directory {static_root}: {exc}. " + "Continuing without static PDF publishing. 
Use --static-root with a writable path, " + "fix directory ownership, or pass --skip-static-publish.", + file=sys.stderr, + ) + static_root = None + + for pdf_group_dir in sorted(path for path in pdf_root.iterdir() if path.is_dir()): + group_name = pdf_group_dir.name + release_dir = data_root / f"release_spec_{_safe_path_part(group_name, 'release')}" + trend_dir = data_root / f"trend_release_{_safe_path_part(group_name, 'release')}" + release_dir.mkdir(parents=True, exist_ok=True) + stats = stats.add(releases=1) + + release_metadata_written = False + for topic_dir in sorted(path for path in pdf_group_dir.iterdir() if path.is_dir()): + topic_name = topic_dir.name + topic_safe = _safe_path_part(topic_name, "topic") + trend_topic_dir = trend_dir / topic_name + trend_topic_dir.mkdir(parents=True, exist_ok=True) + + specsheet_pdf = topic_dir / "specsheet" / "specsheet.pdf" + if specsheet_pdf.exists(): + action = _copy_or_link( + specsheet_pdf, + release_dir / "specsheet" / topic_safe / "specsheet.pdf", + copy_large_artifacts=copy_large_artifacts, + force=force, + ) + stats = _artifact_stat(stats, action) + action = _copy_or_link( + specsheet_pdf, + trend_topic_dir / "specsheet" / "specsheet.pdf", + copy_large_artifacts=copy_large_artifacts, + force=force, + ) + stats = _artifact_stat(stats, action) + if topic_name == MAIN_TOPIC: + action = _copy_or_link( + specsheet_pdf, + release_dir / "specsheet" / "specsheet.pdf", + copy_large_artifacts=copy_large_artifacts, + force=force, + ) + stats = _artifact_stat(stats, action) + if static_root is not None: + static_pdf_path = ( + static_root + / _safe_path_part(group_name, "release") + / f"{_safe_path_part(topic_name, 'topic')}.pdf" + ) + action = _publish_static_pdf(specsheet_pdf, static_pdf_path, force=force) + stats = _artifact_stat(stats, action) + + for job_dir in sorted(path for path in topic_dir.iterdir() if path.is_dir()): + if job_dir.name in {"trend", "specsheet"}: + continue + summary_path = job_dir / "summary.json" 
+ if not summary_path.exists(): + continue + job_id = job_dir.name + role = _classify_summary(_load_json(summary_path)) + role_dir_name = ROLE_DIR_BY_SUMMARY_ROLE.get(role, role) + + trend_job_dir = trend_topic_dir / job_id + trend_job_dir.mkdir(parents=True, exist_ok=True) + for src_name in ("summary.json", "metadata.yaml"): + src = job_dir / src_name + if not src.exists(): + continue + if src_name == "metadata.yaml": + metadata = _merge_metadata( + _load_yaml(src), + group_name=group_name, + topic_name=topic_name, + job_id=job_id, + role=role, + ) + _write_yaml(trend_job_dir / src_name, metadata) + stats = stats.add(copied=1) + else: + action = _copy_or_link(src, trend_job_dir / src_name, copy_large_artifacts=True, force=force) + stats = _artifact_stat(stats, action) + stats = stats.add(trend_jobs=1) + + if topic_name != MAIN_TOPIC: + continue + role_dir = release_dir / role_dir_name + stats = _copy_export_job( + export_root, + job_id, + role_dir, + group_name=group_name, + topic_name=topic_name, + role=role, + copy_large_artifacts=copy_large_artifacts, + force=force, + stats=stats, + ) + stats = _copy_summary_job( + job_dir, + role_dir, + group_name=group_name, + topic_name=topic_name, + job_id=job_id, + role=role, + force=force, + stats=stats, + ) + stats = stats.add(role_runs=1) + + if not release_metadata_written and role in {"full", "performance_blocks"}: + metadata = _merge_metadata( + _load_yaml(job_dir / "metadata.yaml"), + group_name=group_name, + topic_name=topic_name, + job_id=job_id, + role=role, + ) + _write_yaml(release_dir / "metadata.yaml", metadata) + release_metadata_written = True + stats = stats.add(copied=1) + + if not release_metadata_written: + _write_yaml(release_dir / "metadata.yaml", {"release_group": group_name, "imported_from": str(analyzer_root)}) + stats = stats.add(copied=1) + + return stats + + +def main() -> int: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) 
+ parser.add_argument( + "--source", + type=Path, + default=ANALYZER_ROOT, + help="Analyzer output root containing export/ and pdf/. Default: %(default)s", + ) + parser.add_argument( + "--data-root", + type=Path, + default=None, + help="Dashboard data root. Defaults to EVAL_DASHBOARD_DATA_ROOT or ./data.", + ) + parser.add_argument( + "--copy-large-artifacts", + action="store_true", + help=( + "Copy parquet/PDF/PNG/HTML instead of symlinking them. Use this on servers " + "when the original analyzer output will not stay mounted." + ), + ) + parser.add_argument( + "--static-root", + type=Path, + default=None, + help=( + "Directory for static PDF copies. Defaults to ./static/release_specs. " + "Use a writable path on servers and mount it as /app/static/release_specs." + ), + ) + parser.add_argument( + "--skip-static-publish", + action="store_true", + help="Do not write static/release_specs PDF copies. Data/specsheet PDFs are still imported.", + ) + parser.add_argument("--force", action="store_true", help="Replace existing imported files and links.") + args = parser.parse_args() + + data_root = args.data_root.resolve() if args.data_root is not None else _data_root() + static_root = None + if not args.skip_static_publish: + static_root = (args.static_root if args.static_root is not None else Path.cwd() / "static" / "release_specs").resolve() + stats = import_releases( + args.source.resolve(), + data_root, + static_root=static_root, + copy_large_artifacts=args.copy_large_artifacts, + force=args.force, + ) + print(json.dumps(stats.__dict__, indent=2, ensure_ascii=False)) + print(f"Imported analyzer releases into {data_root}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/evaluation_dashboard_app/slides/evaluation_dashboard_intro_ja.md b/evaluation_dashboard_app/slides/evaluation_dashboard_intro_ja.md new file mode 100644 index 0000000..a9e3ea5 --- /dev/null +++ b/evaluation_dashboard_app/slides/evaluation_dashboard_intro_ja.md @@ -0,0 
+1,202 @@ +--- +marp: true +theme: default +paginate: true +size: 16:9 +--- + +# 評価業務を回す統合ダッシュボード + +### Perception Evaluation Dashboard 紹介 + +- 発表者: (お名前) +- 日付: 2026-04-22 + +--- + +## 今日お話しすること + +1. 背景: 何が課題だったか +2. システム全体像と基本導線 +3. 主要機能の紹介 +4. 技術構成(運用アーキテクチャ) +5. 価値と今後の展開 + +--- + +## 背景: 何が困っていたのか + +- 評価結果の取得、整形、可視化、比較、共有が分断 +- ツール間移動や手作業が多く、時間がかかる +- 比較条件が揃わず、議論が噛み合わない +- 属人化しやすく、再現性が下がる + +--- + +## このシステムの狙い + +**評価業務の一連の流れを一つの場所で回すこと** + +- 取得 +- 整形(Summary / Score / parquet) +- 分析(単体・比較) +- 共有 +- データ管理 + +> 「見るための道具」ではなく「評価業務の基盤」 + +--- + +## システム全体像 + +- Download: 結果・シナリオの取得 +- Eval Results: CSV/評価データ生成 +- Overview: Run/比較条件の統一 +- 各分析ページ: 観点別の深掘り +- Data Management: 共有・整理・運用 + +--- + +## 典型的なユーザー導線(3ステップ) + +1. Downloadで対象結果を取得 +2. Summary.csv / Score.csv を生成 +3. OverviewでRun選択 → 分析ページへ + +**効果:** 作業切替が減り、初心者でも入りやすい + +--- + +## Overviewの役割(ハブ) + +- Single / Compare モード切替 +- Baseline / Candidate の比較前提を統一 +- Perception Label / Product Label で共通フィルタ +- 共有URLで表示状態を再現 + +**ポイント:** 前提を揃えて議論のズレを防ぐ + +--- + +## Downloadの価値 + +- ダウンロード前後の作業を一体化 +- 取得だけでなく、後続分析で使える形まで整備 +- 重い処理はタスク化し、進捗を可視化 + +**運用効果:** 属人化の低減、日常業務の安定化 + +--- + +## TP Summary / Criteria Based Score + +### TP Summary +- Summary.csvベースの全体傾向把握 +- 平均だけでなく分布や外れ値を確認 + +### Criteria Based Score +- Score.csvベースの基準別評価 +- しきい値(ゲート)を使った合否判断に有効 + +--- + +## Detection Stats + +- parquet + DuckDBで詳細分析 +- TP/FPなどの状態別・距離ビン別比較 +- 全体値では見えない偏りを発見 + +**使いどころ:** 「差がある」から「どこで差がある」へ + +--- + +## Bounding Box Viewer + +- BEV上でバウンディングボックスを可視化 +- topic / label / visibility で絞り込み +- Compareで挙動差を視覚的に確認 + +**意義:** 数値の裏にある実体を理解する + +--- + +## TLR Analysis + +- 信号認識評価に特化 +- criteriaマトリクス、車両状態×信号種別、zone分析 +- 比較時は差分ヒートマップで把握 + +**強み:** ドメイン特化で弱点を構造的に把握 + +--- + +## Prediction Evaluation + +- minADE / minFDEなどを距離・方向・ラベルで分解 +- リング表示などで偏りを直感的に把握 + +**意義:** 全体平均だけでなく改善対象を特定できる + +--- + +## Data Management + +- Run一覧(サイズ/更新日時/成果物有無)を可視化 +- ZIP化、共有リンク生成、不要Run削除 + +**実運用で重要:** 分析品質を落とさない整理整頓 + +--- + +## Help / Debugページ + +- Help: 
アプリ内で使い方を参照 +- Parquet Debug: スキーマやデータ切り分け +- Deployment Debug: Postgres / Redis / Worker状態確認 + +**設計思想:** 使う人だけでなく支える人の導線も用意 + +--- + +## 技術構成(フロント〜バック) + +- フロント: Streamlit +- 非同期: Redis + Worker +- 状態管理: Postgres +- 配置: Nginx配下に複数Streamlit + Worker群 + +**狙い:** UIを軽く保ち、重い処理はバックグラウンド化 + +--- + +## このアーキテクチャのメリット + +- 重い処理で画面が固まりにくい +- 複数人運用でも影響を分離しやすい +- 障害切り分けがしやすい +- スケールしやすい(UIと処理を分離) + +--- + +## まとめ + +このシステムは、 + +- 取得 +- 整形 +- 分析 +- 共有 +- 運用 + +を一体化した**評価業務の基盤**です。 + +> **評価を見える化するだけでなく、回せるようにするシステム** + +--- + +## Q&A + +- ご質問はこのセクションへご記入ください +- 時間内に回答しきれない場合は後ほどフォローします + +ありがとうございました。 diff --git a/evaluation_dashboard_app/tests/conftest.py b/evaluation_dashboard_app/tests/conftest.py new file mode 100644 index 0000000..e24af09 --- /dev/null +++ b/evaluation_dashboard_app/tests/conftest.py @@ -0,0 +1,8 @@ +"""Pytest configuration for evaluation_dashboard_app tests.""" + + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "integration: tests that require a live service or network (opt-in)", + ) diff --git a/evaluation_dashboard_app/tests/test_t4_visualizer_client.py b/evaluation_dashboard_app/tests/test_t4_visualizer_client.py new file mode 100644 index 0000000..26aabc6 --- /dev/null +++ b/evaluation_dashboard_app/tests/test_t4_visualizer_client.py @@ -0,0 +1,200 @@ +"""Tests for lib/t4_visualizer_client.py. + +Unit tests use mocks (no network). Optional integration tests call a live server when +``T4_VISUALIZER_BASE_URL`` points at a reachable instance (e.g. ``t4-server``); they +skip if the server is down. 
+""" + +from __future__ import annotations + +import base64 +import os +from unittest.mock import MagicMock + +import pytest + +from lib.t4_visualizer_client import ( + ENV_BASE_URL, + RenderRequest, + T4VisualizerClient, + T4VisualizerError, + TargetObjectIn, + target_object_from_gt_row, +) + + +# Minimal valid 1x1 PNG (transparent pixel) +_TINY_PNG_BYTES = ( + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01" + b"\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\x00\x01" + b"\x00\x00\x05\x00\x01\r\n-\xdb\x00\x00\x00\x00IEND\xaeB`\x82" +) +_TINY_PNG_B64 = base64.b64encode(_TINY_PNG_BYTES).decode("ascii") + + +def _ok_response(json_data): + r = MagicMock() + r.ok = True + r.status_code = 200 + r.text = "" + r.json.return_value = json_data + return r + + +def _err_response(status_code: int, text: str = "not found"): + r = MagicMock() + r.ok = False + r.status_code = status_code + r.text = text + return r + + +def test_health_success(): + session = MagicMock() + session.get.return_value = _ok_response({"status": "ok"}) + c = T4VisualizerClient(base_url="http://test:9999", session=session) + assert c.health() == {"status": "ok"} + session.get.assert_called_once() + assert "health" in session.get.call_args[0][0] + + +def test_list_datasets_success(): + session = MagicMock() + session.get.return_value = _ok_response( + {"data_dir": "/data", "datasets": ["ds_a", "ds_b"]} + ) + c = T4VisualizerClient(base_url="http://test", session=session) + d = c.list_datasets() + assert d["datasets"] == ["ds_a", "ds_b"] + assert d["data_dir"] == "/data" + + +def test_list_dataset_scenarios_success(): + session = MagicMock() + session.get.return_value = _ok_response( + { + "t4dataset_id": "ds1", + "scenarios": [ + { + "name": "scene-a", + "token": "tok", + "description": "", + "nbr_samples": 42, + } + ], + "version": None, + } + ) + c = T4VisualizerClient(base_url="http://test", session=session) + out = c.list_dataset_scenarios("ds1") + assert 
out["t4dataset_id"] == "ds1" + assert len(out["scenarios"]) == 1 + assert out["scenarios"][0]["name"] == "scene-a" + assert out["scenarios"][0]["nbr_samples"] == 42 + session.get.assert_called_once() + call_url = session.get.call_args[0][0] + assert "ds1" in call_url and "scenarios" in call_url + + +def test_render_success_decode(): + session = MagicMock() + session.post.return_value = _ok_response( + { + "sample_token": "tok1", + "timestamp_us": 1234567890000000, + "images": [{"label": "CAM_FRONT", "png_base64": _TINY_PNG_B64}], + } + ) + c = T4VisualizerClient(base_url="http://test", session=session) + req = RenderRequest( + t4dataset_id="ds1", + scenario_name="scene-1", + frame_index=0, + target_objects=[TargetObjectIn(uuid="u1", x=1.0, y=2.0, z=0.5, label="car")], + ) + out = c.render(req) + assert out.sample_token == "tok1" + assert out.timestamp_us == 1234567890000000 + assert len(out.images) == 1 + raw = out.decode_png("CAM_FRONT") + assert raw == _TINY_PNG_BYTES + all_pairs = out.decode_all_images() + assert all_pairs == [("CAM_FRONT", _TINY_PNG_BYTES)] + + +def test_render_http_error(): + session = MagicMock() + session.post.return_value = _err_response(404, "Dataset 'x' not found") + c = T4VisualizerClient(base_url="http://test", session=session) + req = RenderRequest(t4dataset_id="x", scenario_name="s", frame_index=0) + with pytest.raises(T4VisualizerError) as ei: + c.render(req) + assert ei.value.status_code == 404 + assert "404" in str(ei.value) or "not found" in ei.value.response_text.lower() + + +def test_render_invalid_json_body(): + session = MagicMock() + r = MagicMock() + r.ok = True + r.status_code = 200 + r.json.side_effect = ValueError("bad json") + session.post.return_value = r + c = T4VisualizerClient(base_url="http://test", session=session) + with pytest.raises(T4VisualizerError, match="Invalid JSON"): + c.render(RenderRequest(t4dataset_id="a", scenario_name="b", frame_index=0)) + + +def test_target_object_from_gt_row_full(): + row = { + 
"uuid": "abc-123", + "x": 10.5, + "y": -2.0, + "z": 0.1, + "label": "pedestrian", + "width": 0.5, + "length": 0.6, + "height": 1.7, + "yaw": 0.25, + } + d = target_object_from_gt_row(row) + assert d["uuid"] == "abc-123" + assert d["x"] == 10.5 + assert d["y"] == -2.0 + assert d["z"] == 0.1 + assert d["label"] == "pedestrian" + assert d["width"] == 0.5 + assert d["length"] == 0.6 + assert d["height"] == 1.7 + assert d["yaw"] == 0.25 + + +def test_target_object_from_gt_row_gt_uuid_partial(): + row = {"gt_uuid": "g1", "x": 1.0, "y": 2.0, "label": "car"} + d = target_object_from_gt_row(row) + assert d["uuid"] == "g1" + assert d["z"] == 0.0 + assert d["width"] == 0.0 + assert d["length"] == 0.0 + assert d["height"] == 0.0 + assert d["yaw"] == 0.0 + + +def test_target_object_from_gt_row_uuid_precedence(): + row = {"uuid": "u", "gt_uuid": "g", "x": 0, "y": 0} + d = target_object_from_gt_row(row) + assert d["uuid"] == "u" + + +@pytest.mark.integration +def test_live_health_if_configured(): + """Skips unless T4_VISUALIZER_BASE_URL is set and server responds.""" + base = os.environ.get(ENV_BASE_URL) + if not base: + pytest.skip(f"Set {ENV_BASE_URL} to run integration test against a live server") + client = T4VisualizerClient(base_url=base, timeout=5.0) + try: + h = client.health() + except (T4VisualizerError, OSError) as e: + pytest.skip(f"Server not reachable: {e}") + assert h.get("status") == "ok" diff --git a/evaluation_dashboard_app/worker/run_worker.py b/evaluation_dashboard_app/worker/run_worker.py index 4a4b8d2..f98a2fd 100644 --- a/evaluation_dashboard_app/worker/run_worker.py +++ b/evaluation_dashboard_app/worker/run_worker.py @@ -6,11 +6,13 @@ import os import sys +import faulthandler _APP_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if _APP_ROOT not in sys.path: sys.path.insert(0, _APP_ROOT) os.chdir(_APP_ROOT) +faulthandler.enable(all_threads=True) def main(): from rq import Worker diff --git a/evaluation_dashboard_app/worker/tasks.py 
b/evaluation_dashboard_app/worker/tasks.py index a27cb80..c72f75c 100644 --- a/evaluation_dashboard_app/worker/tasks.py +++ b/evaluation_dashboard_app/worker/tasks.py @@ -5,15 +5,42 @@ import os import re +import json +import shutil import sys -from typing import Any, Dict +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Any, Dict, Optional + +import yaml # App root on path for lib imports _APP_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) if _APP_ROOT not in sys.path: sys.path.insert(0, _APP_ROOT) -from lib.db import update_task_status, update_task_progress, append_task_log, update_task_result_summary +from lib.db import ( + append_task_log, + get_task, + update_task_progress, + update_task_result_summary, + update_task_status, +) +from lib.run_metadata import ( + read_run_metadata, + resolve_run_directory_from_task_parameters, + upsert_run_metadata, +) +from lib.specsheet_report import write_trend_metadata + +_RELEASE_PERFORMANCE_CATALOG_ID = "e36d75b9-6c3a-4970-9b9b-5cd13f7a9da3" +_RELEASE_PERFORMANCE_INTEGRATION_ID = "96ad8fba-0228-4c2b-9166-07d4de1a0760" +_RELEASE_DEVOPS_CATALOG_ID = "ab0f8498-cc1b-4726-836f-e18e8bcb3200" +_RELEASE_DEVOPS_INTEGRATION_ID = "295cff78-9bc9-4d60-b7aa-f95be6ff96a4" +_RELEASE_OPTIONAL_CATALOG_ID = "09039022-ec91-41bf-9e93-fdefccdfc9bc" +_RELEASE_SKIP_LARGE_FILE = True +_RELEASE_LARGE_FILE_MB = 50.0 # Optional imports for tasks that need them def _import_eval_summary(): @@ -28,14 +55,330 @@ def _import_catalog_io(): return None +def _parquet_progress_callback( + task_id: str, + *, + prefix: str = "Parquet", + pct_start: float = 0.0, + pct_end: float = 100.0, +): + """Return a pkl-file progress callback for pkl_archive_to_parquet.""" + + def _on_progress(done: int, total: int) -> None: + total_safe = max(1, int(total or 0)) + done_safe = min(max(0, int(done or 0)), total_safe) + pct = pct_start + (done_safe / total_safe) * max(0.0, pct_end 
- pct_start) + message = f"{prefix}: processing pkl files {done_safe}/{total_safe}" + update_task_progress(task_id, message=message, pct=min(pct_end, pct)) + append_task_log(task_id, message) + + return _on_progress + + +def _eval_worker_count(parameters: Dict[str, Any], total: int) -> int: + """Resolve bounded eval concurrency. Defaults to 4, capped by total dirs.""" + if total <= 0: + return 1 + raw = parameters.get("eval_workers", os.environ.get("EVAL_WORKERS_DEFAULT", 4)) + try: + workers = int(raw) + except (TypeError, ValueError): + workers = 4 + try: + max_workers = int(os.environ.get("EVAL_WORKERS_MAX", 16)) + except ValueError: + max_workers = 16 + return max(1, min(workers, max_workers, total)) + + +def _compact_eval_path(path: Any, *, parts: int = 2) -> str: + """Return a readable tail path for task logs without flooding the UI.""" + text = str(path or "").strip() + if not text: + return "unknown" + try: + p = Path(text) + tail = p.parts[-parts:] + return "/".join(tail) if tail else text + except Exception: + return text + + +def _run_eval_result_dirs( + *, + task_id: str, + eval_summary: Any, + target_dirs: list[str], + overwrite: bool, + eval_workers: int, + pct_start: float, + pct_end: float, + label: str = "Eval", +) -> list[Dict[str, Any]]: + """Run eval_result across result dirs with bounded concurrency and calm progress.""" + total = len(target_dirs) + if total <= 0: + update_task_progress(task_id, message=f"{label}: no result directories found", pct=pct_end) + return [] + + workers = max(1, min(int(eval_workers or 1), total)) + span = max(0.0, pct_end - pct_start) + statuses: list[Dict[str, Any]] = [] + counts = {"success": 0, "skipped": 0, "failed": 0} + + def _record(status: Dict[str, Any]) -> str: + statuses.append(status) + state = str(status.get("status") or "failed") + if state not in counts: + state = "failed" + counts[state] += 1 + if state == "failed": + append_task_log( + task_id, + f"{label}: eval failed for {status.get('path', '')}: 
{status.get('detail', '')}", + ) + return state + + def _progress(done: int, latest: str | None = None) -> None: + pct = pct_start + (done / total) * span + latest_text = f" latest: {latest}" if latest else "" + update_task_progress( + task_id, + message=( + f"{label}: completed {done}/{total} dirs " + f"(success {counts['success']}, skipped {counts['skipped']}, failed {counts['failed']})" + f"{latest_text}" + ), + pct=min(pct_end, pct), + ) + + append_task_log(task_id, f"{label}: running eval_result for {total} directories with {workers} worker(s)") + _progress(0) + + if workers == 1: + for i, result_dir in enumerate(target_dirs, start=1): + append_task_log(task_id, f"{label}: starting {i}/{total}: {result_dir}") + status = eval_summary.run_eval_result_for_dir(result_dir, overwrite=overwrite) + state = _record(status) + short_path = _compact_eval_path(status.get("path") or result_dir) + append_task_log(task_id, f"{label}: {i}/{total} {state}: {short_path}") + _progress(i, short_path) + return statuses + + with ThreadPoolExecutor(max_workers=workers) as executor: + future_map = { + executor.submit(eval_summary.run_eval_result_for_dir, result_dir, overwrite=overwrite): result_dir + for result_dir in target_dirs + } + for done, future in enumerate(as_completed(future_map), start=1): + result_dir = future_map[future] + try: + status = future.result() + except Exception as exc: + status = {"path": result_dir, "status": "failed", "detail": str(exc)} + state = _record(status) + short_path = _compact_eval_path(status.get("path") or result_dir) + append_task_log(task_id, f"{label}: {done}/{total} {state}: {short_path}") + _progress(done, short_path) + return statuses + + +def _copy_task_parameters(parameters: Dict[str, Any]) -> Dict[str, Any]: + copied: Dict[str, Any] = {} + for key, value in (parameters or {}).items(): + if isinstance(value, (dict, list, tuple, str, int, float, bool)) or value is None: + copied[key] = value + else: + copied[key] = str(value) + return 
copied + + +def _resolve_active_integration_id(api: Any, project_id: str, catalog_id: str) -> str: + """Resolve latest active integration for a catalog when the UI only provided a catalog id.""" + url = f"{api.api_base_url}/projects/{project_id}/integrations" + response = api.request(url, {"catalog_id": catalog_id, "size": 100}, method="GET") + if response is None: + raise RuntimeError(f"No response returned while loading integrations for catalog {catalog_id}.") + if response.status_code != 200: + raise RuntimeError( + f"Failed to load integrations for catalog {catalog_id}: status={response.status_code}" + ) + payload = json.loads(response.content) + integrations = payload.get("integrations", []) or [] + active = [ + item for item in integrations + if isinstance(item, dict) + and str(item.get("catalog_id") or "").strip() == catalog_id + and not bool(item.get("deleted")) + ] + if not active: + raise RuntimeError(f"No active integration found for catalog {catalog_id}.") + + def _sort_key(item: Dict[str, object]) -> tuple: + return ( + str(item.get("updated_at") or ""), + int(item.get("version_id") or 0), + str(item.get("id") or ""), + ) + + active.sort(key=_sort_key, reverse=True) + return str(active[0].get("id") or "").strip() + + +def _task_row_payload(task_id: str) -> Dict[str, Any]: + row = get_task(task_id) or {} + return { + "id": str(row.get("id") or task_id), + "type": str(row.get("type") or "").strip(), + "status": str(row.get("status") or "").strip(), + "requested_by": str(row.get("session_id") or "").strip(), + "created_at": row.get("created_at"), + "updated_at": row.get("updated_at"), + "result_path": str(row.get("result_path") or "").strip(), + "error_message": str(row.get("error_message") or "").strip(), + "progress_message": str(row.get("progress_message") or "").strip(), + "progress_pct": row.get("progress_pct"), + } + + +def _task_request_payload(parameters: Dict[str, Any]) -> Dict[str, Any]: + params = _copy_task_parameters(parameters) + return { + 
"environment": str(params.get("environment") or "default").strip() or "default", + "project_id": str(params.get("project_id") or "").strip(), + "job_id": str(params.get("job_id") or "").strip(), + "catalog_id": str(params.get("catalog_id") or "").strip(), + "integration_id": str(params.get("integration_id") or "").strip(), + "source_job_id": str(params.get("source_job_id") or "").strip(), + "target_name": str(params.get("target_name") or "").strip(), + "description": str(params.get("description") or "").strip(), + "suite_id": str(params.get("suite_id") or "").strip(), + "suite_ids": list(params.get("suite_ids") or []), + "download_type": str(params.get("download_type") or "").strip(), + "phase": str(params.get("phase") or "").strip(), + "skip_large_file": bool(params.get("skip_large_file", False)), + "large_file_mb": params.get("large_file_mb"), + "keep_zip_files": bool(params.get("keep_zip_files", False)), + "run_eval": bool(params.get("run_eval", False)), + "generate_parquet": bool(params.get("generate_parquet", False)), + "eval_recursive": bool(params.get("eval_recursive", False)), + "eval_overwrite": bool(params.get("eval_overwrite", False)), + "max_retries": params.get("max_retries"), + "clean_build": bool(params.get("clean_build", False)), + "debug": bool(params.get("debug", False)), + "is_tag": bool(params.get("is_tag", False)), + "scenario_name_filter": str(params.get("scenario_name_filter") or "").strip(), + "selected_ids": list(params.get("selected_ids") or []), + "output_path": str( + params.get("output_path") + or params.get("output_dir") + or params.get("eval_root") + or params.get("pkl_dir") + or "" + ).strip(), + "parameters": params, + } + + +def _build_run_metadata_patch(task_id: str, parameters: Dict[str, Any], *, task_type: str) -> Dict[str, Any]: + return { + "source_mode": task_type, + "task": _task_row_payload(task_id), + "request": _task_request_payload(parameters), + } + + +def _update_run_metadata( + task_id: str, + parameters: Dict[str, 
Any], + *, + task_type: str, + create_missing: bool = False, + extra: Optional[Dict[str, Any]] = None, +) -> None: + run_dir = resolve_run_directory_from_task_parameters(parameters, create_missing=create_missing) + if run_dir is None: + return + patch = _build_run_metadata_patch(task_id, parameters, task_type=task_type) + if extra: + patch.update(extra) + try: + upsert_run_metadata(run_dir, patch, create_missing=create_missing) + except Exception: + pass + + +def _append_run_event( + task_id: str, + parameters: Dict[str, Any], + *, + task_type: str, + message: str, +) -> None: + run_dir = resolve_run_directory_from_task_parameters(parameters, create_missing=False) + if run_dir is None: + return + try: + metadata = read_run_metadata(run_dir) + events = list(metadata.get("events") or []) + events.append({"at": _task_row_payload(task_id).get("updated_at"), "message": message}) + if len(events) > 50: + events = events[-50:] + upsert_run_metadata( + run_dir, + { + "events": events, + "task": _task_row_payload(task_id), + }, + create_missing=False, + ) + except Exception: + pass + + +def _mark_run_status( + task_id: str, + parameters: Dict[str, Any], + *, + task_type: str, + status: str, + error_message: str = "", + result_path: str = "", + extra: Optional[Dict[str, Any]] = None, + create_missing: bool = False, +) -> None: + patch: Dict[str, Any] = { + "task": { + "status": status, + } + } + if error_message: + patch["task"]["error_message"] = error_message + if result_path: + patch["task"]["result_path"] = result_path + if extra: + patch.update(extra) + _update_run_metadata( + task_id, + parameters, + task_type=task_type, + create_missing=create_missing, + extra=patch, + ) + + def job_generate_summary_csv(task_id: str, parameters: Dict[str, Any]) -> None: """Generate Summary.csv and Score.csv under eval_root.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting generate_summary_csv") + _mark_run_status(task_id, parameters, 
task_type="generate_summary_csv", status="running") try: eval_summary = _import_eval_summary() eval_root = parameters.get("eval_root") if not eval_root: + _mark_run_status( + task_id, parameters, task_type="generate_summary_csv", status="failed", error_message="Missing eval_root" + ) update_task_status(task_id, "failed", error_message="Missing eval_root") return append_task_log(task_id, f"Generating summary under {eval_root}") @@ -50,10 +393,28 @@ def job_generate_summary_csv(task_id: str, parameters: Dict[str, Any]) -> None: "score_rows": info.get("score_rows", 0), }, ) + _update_run_metadata( + task_id, + parameters, + task_type="generate_summary_csv", + extra={ + "evaluation": { + "summary_path": result_path, + "summary_rows": info.get("summary_rows", 0), + "score_rows": info.get("score_rows", 0), + } + }, + ) append_task_log(task_id, f"Done. Output: {result_path}") + _mark_run_status( + task_id, parameters, task_type="generate_summary_csv", status="completed", result_path=str(result_path or "") + ) update_task_status(task_id, "completed", result_path=result_path) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status( + task_id, parameters, task_type="generate_summary_csv", status="failed", error_message=str(e) + ) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -62,40 +423,76 @@ def job_run_eval_dirs(task_id: str, parameters: Dict[str, Any]) -> None: """Run eval_result for each dir under eval_root, then generate Summary/Score CSV.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting run_eval_dirs") + _mark_run_status(task_id, parameters, task_type="run_eval_dirs", status="running") try: eval_summary = _import_eval_summary() eval_root = parameters.get("eval_root") recursive = parameters.get("recursive", True) overwrite = parameters.get("overwrite", False) if not eval_root: + _mark_run_status(task_id, parameters, task_type="run_eval_dirs", status="failed", error_message="Missing eval_root") 
update_task_status(task_id, "failed", error_message="Missing eval_root") return target_dirs = eval_summary.find_eval_result_dirs(eval_root, recursive=recursive) if not target_dirs: + _mark_run_status( + task_id, parameters, task_type="run_eval_dirs", status="failed", error_message="No result directories found" + ) update_task_status(task_id, "failed", error_message="No result directories found") return total = len(target_dirs) - append_task_log(task_id, f"Processing {total} directories") - for i, result_dir in enumerate(target_dirs): - pct = 100.0 * (i + 1) / total if total else 0 - update_task_progress(task_id, message=f"Processing {i+1}/{total}: {result_dir}", pct=pct) - append_task_log(task_id, f"Processing {i+1}/{total}: {result_dir}") - eval_summary.run_eval_result_for_dir(result_dir, overwrite=overwrite) + eval_workers = _eval_worker_count(parameters, total) + statuses = _run_eval_result_dirs( + task_id=task_id, + eval_summary=eval_summary, + target_dirs=target_dirs, + overwrite=overwrite, + eval_workers=eval_workers, + pct_start=0.0, + pct_end=90.0, + label="Eval", + ) append_task_log(task_id, "Generating summary CSV") + update_task_progress(task_id, message="Generating Summary.csv / Score.csv", pct=95) info = eval_summary.generate_summary_and_score_csv(eval_root) result_path = info.get("summary_path", eval_root) + failed = [s for s in statuses if s.get("status") == "failed"] + skipped = [s for s in statuses if s.get("status") == "skipped"] + succeeded = [s for s in statuses if s.get("status") == "success"] summary = { "job": "run_eval_dirs", "directories_processed": total, + "success": len(succeeded), + "failed": len(failed), + "skipped": len(skipped), "summary_path": result_path, "summary_rows": info.get("summary_rows", 0), "score_rows": info.get("score_rows", 0), } update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_eval_dirs", + extra={ + "evaluation": { + "directories_processed": total, + 
"success": len(succeeded), + "failed": len(failed), + "skipped": len(skipped), + "summary_path": result_path, + "summary_rows": info.get("summary_rows", 0), + "score_rows": info.get("score_rows", 0), + } + }, + ) append_task_log(task_id, f"Done. Output: {result_path}") + update_task_progress(task_id, message="Eval complete", pct=100) + _mark_run_status(task_id, parameters, task_type="run_eval_dirs", status="completed", result_path=result_path) update_task_status(task_id, "completed", result_path=result_path) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="run_eval_dirs", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -104,30 +501,50 @@ def job_build_parquet(task_id: str, parameters: Dict[str, Any]) -> None: """Build scene_result parquet from pkl directory.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting build_parquet") + _mark_run_status(task_id, parameters, task_type="build_parquet", status="running") try: pkl_archive_to_parquet = _import_catalog_io() if pkl_archive_to_parquet is None: + _mark_run_status( + task_id, parameters, task_type="build_parquet", status="failed", error_message="perception_catalog_io not available" + ) update_task_status(task_id, "failed", error_message="perception_catalog_io not available") return pkl_dir = parameters.get("pkl_dir") if not pkl_dir: + _mark_run_status(task_id, parameters, task_type="build_parquet", status="failed", error_message="Missing pkl_dir") update_task_status(task_id, "failed", error_message="Missing pkl_dir") return append_task_log(task_id, f"Building parquet from {pkl_dir}") + update_task_progress(task_id, message=f"Parquet: scanning pkl files in {pkl_dir}", pct=0) project_id = parameters.get("project_id") job_id = parameters.get("job_id") parquet_path = pkl_archive_to_parquet( pkl_dir, - on_progress=None, - on_skip=None, + 
on_progress=_parquet_progress_callback(task_id, pct_start=5, pct_end=95), + on_skip=lambda path, reason: append_task_log(task_id, f"Parquet skipped {path}: {reason}"), project_id=project_id, job_id=job_id, ) + update_task_progress(task_id, message="Parquet: writing output complete", pct=100) update_task_result_summary(task_id, {"job": "build_parquet", "output_path": parquet_path}) + _update_run_metadata( + task_id, + parameters, + task_type="build_parquet", + extra={ + "parquet": { + "enabled": True, + "path": parquet_path, + } + }, + ) append_task_log(task_id, f"Done. Output: {parquet_path}") + _mark_run_status(task_id, parameters, task_type="build_parquet", status="completed", result_path=parquet_path) update_task_status(task_id, "completed", result_path=parquet_path) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="build_parquet", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -144,15 +561,231 @@ def _progress_callback(task_id: str, message: str) -> None: update_task_progress(task_id, message=message) +def _is_failed_case_status(case_report: Dict[str, Any]) -> bool: + """Best-effort failure check for case report payloads.""" + result = case_report.get("result") or {} + status = (result.get("status") or case_report.get("status") or "").strip().lower() + return status in {"failed", "failure", "error", "timed_out", "timeout", "canceled", "cancelled", "aborted"} + + +def _summarize_suite_reports(suite_rows: Any, *, limit: int = 10) -> list[Dict[str, Any]]: + """Normalize suite rows into a compact summary suitable for task result_summary.""" + normalized = [] + for row in suite_rows or []: + normalized.append( + { + "suite_name": row.get("name", ""), + "total": int(row.get("all", 0) or 0), + "success": int(row.get("success", 0) or 0), + "failed": int(row.get("fail", 0) or 0), + "canceled": int(row.get("cancel", 0) or 0), + "simulation": 
row.get("simulation", ""), + "url": row.get("url", ""), + } + ) + normalized.sort(key=lambda item: (-item["failed"], item["suite_name"])) + return normalized[:limit] + + +def _suite_case_totals(suite_rows: Any) -> Dict[str, int]: + """Aggregate totals from full suite rows.""" + totals = {"total": 0, "success": 0, "failed": 0, "canceled": 0} + for row in suite_rows or []: + totals["total"] += int(row.get("all", 0) or 0) + totals["success"] += int(row.get("success", 0) or 0) + totals["failed"] += int(row.get("fail", 0) or 0) + totals["canceled"] += int(row.get("cancel", 0) or 0) + return totals + + +def _extract_failed_case_details(case_reports: Any, *, limit: int = 12) -> list[Dict[str, Any]]: + """Return a compact list of failed cases for UI/log display.""" + failed = [] + for report in case_reports or []: + if not _is_failed_case_status(report): + continue + failed.append( + { + "scenario_name": ((report.get("scenario") or {}).get("display_name", "")), + "suite_name": ((report.get("suite") or {}).get("display_name", "")), + "status": report.get("status", ""), + "fail_message": report.get("fail_message", ""), + "failure_cause_labels": report.get("failure_cause_labels", []), + "archive_log_id": (((report.get("logs") or {}).get("simulation_archive") or {}).get("id", "")), + "result_json_log_id": (((report.get("logs") or {}).get("simulation_result_json") or {}).get("id", "")), + } + ) + failed.sort(key=lambda item: (item["suite_name"], item["scenario_name"], item["fail_message"])) + return failed[:limit] + + +def _extract_git_target_from_report(report: Dict[str, Any]) -> str: + """Compact branch/tag label from evaluator report metadata.""" + source = ((report.get("event") or {}).get("source") or {}) + git_ref = str(source.get("git_ref") or "").strip() + if git_ref.startswith("refs/heads/"): + return git_ref[len("refs/heads/"):] + if git_ref.startswith("refs/tags/"): + return git_ref[len("refs/tags/"):] + return git_ref or str(source.get("git_sha") or "").strip()[:12] 
or "" + + +def _extract_job_title_from_report(report: Dict[str, Any]) -> str: + """Prefer evaluator description for display title, with a readable fallback.""" + description = str(report.get("description") or "").strip() + if description: + return description + started_like = report.get("started_at") or report.get("scheduled_at") or report.get("finished_at") + return f"no description ({started_like or 'unknown start'})" + + +def _extract_catalog_url_from_report(report: Dict[str, Any]) -> str: + """Best-effort catalog URL matching the recent evaluator jobs list.""" + catalog = report.get("catalog") or {} + direct_url = str( + catalog.get("web_url") + or catalog.get("url") + or catalog.get("catalog_url") + or "" + ).strip() + if direct_url: + return direct_url + project_id = str(report.get("project_id") or "").strip() + catalog_id = str(catalog.get("catalog_id") or catalog.get("id") or "").strip() + if project_id and catalog_id: + return f"https://evaluation.tier4.jp/evaluation/vehicle_catalogs/{catalog_id}?project_id={project_id}" + return "" + + +def _extract_source_metadata_from_report(report: Dict[str, Any]) -> Dict[str, str]: + """Best-effort source metadata for local run rendering without refetching.""" + source = ((report.get("event") or {}).get("source") or {}) + git_url = str(source.get("git_web_url") or source.get("git_url") or "").strip() + return { + "title": _extract_job_title_from_report(report), + "target": _extract_git_target_from_report(report), + "git_sha": str(source.get("git_sha") or "").strip(), + "git_ref_url": str(source.get("git_ref_url") or "").strip(), + "git_commit_url": str(source.get("git_commit_url") or "").strip(), + "source_url": git_url, + "source_repo_label": git_url.rstrip("/").split("/")[-1] if git_url else "", + } + + +def _build_evaluator_result_summary( + *, + job_id: str, + report_url: str, + evaluator_status: str, + final_report: Dict[str, Any], + suite_rows: Any = None, + failed_cases: Any = None, +) -> Dict[str, Any]: + 
"""Build a compact evaluator summary that the task detail UI can render.""" + build = final_report.get("build") or {} + test = final_report.get("test") or {} + available = test.get("available_case_results") or test.get("case_results") or {} + case_totals = _suite_case_totals(suite_rows) + source_meta = _extract_source_metadata_from_report(final_report) + if not any(case_totals.values()): + case_totals = { + "total": int(available.get("total_count", 0) or 0), + "success": int(available.get("success_count", 0) or 0), + "failed": int(available.get("failure_count", 0) or 0), + "canceled": int(available.get("cancellation_count", 0) or 0), + } + return { + "evaluator_job_id": job_id, + "evaluator_report_url": report_url, + "evaluator_status": evaluator_status, + "evaluator_scheduled_by": final_report.get("scheduled_by", ""), + "evaluator_catalog_id": ((final_report.get("catalog") or {}).get("id") or ""), + "evaluator_catalog_name": ((final_report.get("catalog") or {}).get("display_name") or ""), + "evaluator_catalog_version_id": ((final_report.get("catalog") or {}).get("version_id") or ""), + "evaluator_catalog_url": _extract_catalog_url_from_report(final_report), + "evaluator_title": source_meta.get("title", ""), + "evaluator_target": source_meta.get("target", ""), + "evaluator_git_sha": source_meta.get("git_sha", ""), + "evaluator_git_ref_url": source_meta.get("git_ref_url", ""), + "evaluator_git_commit_url": source_meta.get("git_commit_url", ""), + "evaluator_source_url": source_meta.get("source_url", ""), + "evaluator_source_repo_label": source_meta.get("source_repo_label", ""), + "evaluator_build_status": build.get("status", ""), + "evaluator_test_status": test.get("status", ""), + "evaluator_fail_message": final_report.get("fail_message", ""), + "evaluator_case_totals": case_totals, + "evaluator_suites": _summarize_suite_reports(suite_rows), + "evaluator_failed_cases": _extract_failed_case_details(failed_cases), + } + + +def _fetch_evaluator_context( + *, + 
project_id: str, + job_id: str, + environment: str, +) -> Dict[str, Any]: + """Best-effort evaluator metadata for tasks that start from an existing evaluator job.""" + if not project_id or not job_id: + return {} + try: + from lib import evaluator_api + + os.environ["AUTH_PROFILE"] = environment or "default" + api = evaluator_api.EvaluationRunAPI() + report = api.get_job_status(project_id, job_id) + status = evaluator_api.extract_job_status(report) + build = report.get("build") or {} + test = report.get("test") or {} + available = test.get("available_case_results") or test.get("case_results") or {} + source_meta = _extract_source_metadata_from_report(report) + return { + "job_id": job_id, + "report_url": evaluator_api.get_job_report_url(project_id, job_id), + "status": status, + "scheduled_by": str(report.get("scheduled_by") or "").strip(), + "catalog_id": str(((report.get("catalog") or {}).get("id") or "")).strip(), + "catalog_name": str(((report.get("catalog") or {}).get("display_name") or "")).strip(), + "catalog_version_id": (report.get("catalog") or {}).get("version_id"), + "catalog_url": _extract_catalog_url_from_report(report), + "title": source_meta.get("title", ""), + "target": source_meta.get("target", ""), + "git_sha": source_meta.get("git_sha", ""), + "git_ref_url": source_meta.get("git_ref_url", ""), + "git_commit_url": source_meta.get("git_commit_url", ""), + "source_url": source_meta.get("source_url", ""), + "source_repo_label": source_meta.get("source_repo_label", ""), + "build_status": str(build.get("status") or "").strip(), + "test_status": str(test.get("status") or "").strip(), + "fail_message": str(report.get("fail_message") or "").strip(), + "case_totals": { + "total": int(available.get("total_count", 0) or 0), + "success": int(available.get("success_count", 0) or 0), + "failed": int(available.get("failure_count", 0) or 0), + "canceled": int(available.get("cancellation_count", 0) or 0), + }, + } + except Exception: + return {} + + def 
job_download_results(task_id: str, parameters: Dict[str, Any]) -> None: """Download job results (archives or result JSON) and extract/organize. Requires auth.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting download_results") + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="running", + create_missing=True, + ) try: from lib import download_core # noqa: F401 output_path = parameters.get("output_path") project_id = parameters.get("project_id") job_id = parameters.get("job_id") + environment = str(parameters.get("environment") or "default").strip() or "default" suite_id = parameters.get("suite_id") suite_ids = parameters.get("suite_ids") # optional list download_type = parameters.get("download_type", "archives") # archives | result_json @@ -161,8 +794,25 @@ def job_download_results(task_id: str, parameters: Dict[str, Any]) -> None: large_file_mb = float(parameters.get("large_file_mb", 50.0)) keep_zip_files = parameters.get("keep_zip_files", False) if not all([output_path, project_id, job_id]): + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="failed", + error_message="Missing output_path, project_id, or job_id", + create_missing=True, + ) update_task_status(task_id, "failed", error_message="Missing output_path, project_id, or job_id") return + evaluator_context = _fetch_evaluator_context(project_id=project_id, job_id=job_id, environment=environment) + if evaluator_context: + _update_run_metadata( + task_id, + parameters, + task_type="download_results", + create_missing=True, + extra={"evaluator": evaluator_context}, + ) on_progress = lambda msg: _progress_callback(task_id, msg) on_warning = lambda msg: append_task_log(task_id, msg) failure_count, total_attempted, rows = download_core.run_download_results( @@ -189,22 +839,67 @@ def job_download_results(task_id: str, parameters: Dict[str, Any]) -> None: "rows": rows[:500], } update_task_result_summary(task_id, 
summary) + _update_run_metadata( + task_id, + parameters, + task_type="download_results", + create_missing=True, + extra={ + "download": { + "mode": "download_results", + "total": total_attempted, + "success": success_count, + "failed": failure_count, + "rows": rows[:100], + "download_type": download_type, + "phase": phase, + "skip_large_file": bool(skip_large_file), + "large_file_mb": large_file_mb, + "keep_zip_files": bool(keep_zip_files), + } + }, + ) append_task_log(task_id, "Download and extract completed") if success_count == 0 and failure_count > 0: err_msg = f"Download completed with {failure_count} failures. See task log for details." + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="failed", + result_path=output_path, + error_message=err_msg, + ) update_task_status(task_id, "failed", result_path=output_path, error_message=err_msg) else: + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="completed", + result_path=output_path, + ) update_task_status(task_id, "completed", result_path=output_path) except ImportError: + _mark_run_status( + task_id, + parameters, + task_type="download_results", + status="failed", + error_message="Download worker not available: lib.download_core not implemented", + create_missing=True, + ) update_task_status( task_id, "failed", error_message="Download worker not available: lib.download_core not implemented", ) except NotImplementedError as e: + _mark_run_status(task_id, parameters, task_type="download_results", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="download_results", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -213,19 +908,44 @@ def job_download_scenarios(task_id: str, parameters: Dict[str, Any]) -> None: """Download 
scenarios from job to output_dir. Requires auth.""" update_task_status(task_id, "running") append_task_log(task_id, "Starting download_scenarios") + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="running", + create_missing=True, + ) try: from lib import download_core # noqa: F401 output_dir = parameters.get("output_dir") or parameters.get("output_path") project_id = parameters.get("project_id") job_id = parameters.get("job_id") + environment = str(parameters.get("environment") or "default").strip() or "default" suite_id = parameters.get("suite_id") suite_ids = parameters.get("suite_ids") overwrite = parameters.get("overwrite", False) scenario_name_filter = parameters.get("scenario_name_filter") selected_ids = parameters.get("selected_ids") if not all([output_dir, project_id, job_id]): + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="failed", + error_message="Missing output_dir, project_id, or job_id", + create_missing=True, + ) update_task_status(task_id, "failed", error_message="Missing output_dir, project_id, or job_id") return + evaluator_context = _fetch_evaluator_context(project_id=project_id, job_id=job_id, environment=environment) + if evaluator_context: + _update_run_metadata( + task_id, + parameters, + task_type="download_scenarios", + create_missing=True, + extra={"evaluator": evaluator_context}, + ) on_progress = lambda msg: _progress_callback(task_id, msg) on_warning = lambda msg: append_task_log(task_id, msg) failure_count, total_attempted, rows = download_core.run_download_scenarios( @@ -250,22 +970,1573 @@ def job_download_scenarios(task_id: str, parameters: Dict[str, Any]) -> None: "rows": rows[:500], } update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="download_scenarios", + create_missing=True, + extra={ + "scenario_download": { + "total": total_attempted, + "success": success_count, + "failed": failure_count, + 
"overwrite": bool(overwrite), + "scenario_name_filter": str(scenario_name_filter or "").strip(), + "selected_ids": list(selected_ids or []), + "rows": rows[:100], + } + }, + ) append_task_log(task_id, "Download scenarios completed") if failure_count > 0: err_msg = f"Download completed with {failure_count} failures. See task log for details." + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="failed", + result_path=output_dir, + error_message=err_msg, + ) update_task_status(task_id, "failed", result_path=output_dir, error_message=err_msg) else: + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="completed", + result_path=output_dir, + ) update_task_status(task_id, "completed", result_path=output_dir) except ImportError: + _mark_run_status( + task_id, + parameters, + task_type="download_scenarios", + status="failed", + error_message="Download worker not available: lib.download_core not implemented", + create_missing=True, + ) update_task_status( task_id, "failed", error_message="Download worker not available: lib.download_core not implemented", ) except NotImplementedError as e: + _mark_run_status(task_id, parameters, task_type="download_scenarios", status="failed", error_message=str(e)) + update_task_status(task_id, "failed", error_message=str(e)) + except Exception as e: + append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="download_scenarios", status="failed", error_message=str(e)) + update_task_status(task_id, "failed", error_message=str(e)) + raise + + +def job_download_and_eval(task_id: str, parameters: Dict[str, Any]) -> None: + """Download results, then run eval and parquet generation. 
Stops on download failure.""" + update_task_status(task_id, "running") + append_task_log(task_id, "Starting download_and_eval combined workflow") + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="running", + create_missing=True, + ) + try: + from lib import download_core + output_path = parameters.get("output_path") + project_id = parameters.get("project_id") + job_id = parameters.get("job_id") + environment = str(parameters.get("environment") or "default").strip() or "default" + suite_id = parameters.get("suite_id") + suite_ids = parameters.get("suite_ids") + download_type = parameters.get("download_type", "archives") + phase = parameters.get("phase", "perception.object_recognition.tracking.objects") + skip_large_file = parameters.get("skip_large_file", False) + large_file_mb = float(parameters.get("large_file_mb", 50.0)) + keep_zip_files = parameters.get("keep_zip_files", False) + run_eval = parameters.get("run_eval", True) + generate_parquet = parameters.get("generate_parquet", True) + eval_recursive = parameters.get("eval_recursive", True) + eval_overwrite = parameters.get("eval_overwrite", False) + + if not all([output_path, project_id, job_id]): + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="failed", + error_message="Missing output_path, project_id, or job_id", + create_missing=True, + ) + update_task_status(task_id, "failed", error_message="Missing output_path, project_id, or job_id") + return + evaluator_context = _fetch_evaluator_context(project_id=project_id, job_id=job_id, environment=environment) + if evaluator_context: + _update_run_metadata( + task_id, + parameters, + task_type="download_and_eval", + create_missing=True, + extra={"evaluator": evaluator_context}, + ) + + def on_progress(msg: str) -> None: + append_task_log(task_id, msg) + match = re.search(r"(\d+)\s*/\s*(\d+)", msg) + pct = None + if match: + n, m = int(match.group(1)), max(1, int(match.group(2))) + ratio = 
n / m + if msg.startswith("Eval:"): + pct = 60.0 + ratio * 25.0 + elif msg.startswith("Parquet:"): + pct = 85.0 + ratio * 13.0 + elif msg.startswith("Downloading"): + pct = ratio * 60.0 + if pct is None: + if msg.startswith("Download complete"): + pct = 60.0 + elif msg.startswith("Generating parquet"): + pct = 85.0 + if pct is None: + update_task_progress(task_id, message=msg) + else: + update_task_progress(task_id, message=msg, pct=pct) + + on_warning = lambda msg: append_task_log(task_id, msg) + + result = download_core.run_download_and_eval( + project_id=project_id, + job_id=job_id, + suite_id=suite_id, + output_path=output_path, + download_type=download_type, + phase=phase, + skip_large_file=skip_large_file, + large_file_mb=large_file_mb, + keep_zip_files=keep_zip_files, + suite_ids=suite_ids, + run_eval=run_eval, + generate_parquet=generate_parquet, + eval_recursive=eval_recursive, + eval_overwrite=eval_overwrite, + eval_workers=_eval_worker_count(parameters, 10_000), + on_progress=on_progress, + on_warning=on_warning, + ) + + # Build result summary + summary = { + "job": "download_and_eval", + "download_success": result.get("download_success", False), + "download_summary": result.get("download_summary", {}), + "eval_summary": result.get("eval_summary", {}), + "parquet_path": result.get("parquet_path", ""), + "errors": result.get("errors", []), + } + update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="download_and_eval", + create_missing=True, + extra={ + "download": { + "mode": "download_and_eval", + **(result.get("download_summary", {}) or {}), + "download_type": download_type, + "phase": phase, + "skip_large_file": bool(skip_large_file), + "large_file_mb": large_file_mb, + "keep_zip_files": bool(keep_zip_files), + }, + "evaluation": { + **(result.get("eval_summary", {}) or {}), + "enabled": bool(run_eval), + "recursive": bool(eval_recursive), + "overwrite": bool(eval_overwrite), + }, + "parquet": { + 
"enabled": bool(generate_parquet), + "path": result.get("parquet_path", ""), + }, + "errors": list(result.get("errors", []) or []), + }, + ) + + if not result.get("download_success"): + err_msg = result.get("errors", ["Download failed"])[0] + append_task_log(task_id, f"Stopped: {err_msg}") + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="failed", + result_path=output_path, + error_message=err_msg, + ) + update_task_status(task_id, "failed", result_path=output_path, error_message=err_msg) + elif result.get("errors"): + # Partial success with some errors + errs = "; ".join(result["errors"][:5]) + append_task_log(task_id, f"Completed with errors: {errs}") + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="completed", + result_path=output_path, + error_message=errs, + ) + update_task_status(task_id, "completed", result_path=output_path) + else: + append_task_log(task_id, "Download and eval completed successfully") + _mark_run_status( + task_id, + parameters, + task_type="download_and_eval", + status="completed", + result_path=output_path, + ) + update_task_status(task_id, "completed", result_path=output_path) + + except Exception as e: + append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="download_and_eval", status="failed", error_message=str(e)) + update_task_status(task_id, "failed", error_message=str(e)) + raise + + +def _write_release_metadata_file(path: Path, metadata: Dict[str, Any]) -> Path: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as fh: + yaml.safe_dump(metadata, fh, allow_unicode=True, sort_keys=False) + return path + + +def _build_devops_trend_summary_from_suites(rows: list[dict[str, Any]]) -> Dict[str, Any]: + suite_results: dict[str, dict[str, int]] = {} + for row in rows or []: + suite_name = str(row.get("name") or row.get("suite_name") or row.get("simulation") or "suite").strip() + total = 
def _suite_row_count(row: dict[str, Any], *keys: str) -> int:
    """Return the first usable integer stored under ``keys`` in ``row``, else 0.

    Keys are tried in order; falsy values are skipped. A value that cannot be
    converted (for example ``"N/A"``) is ignored instead of raising, and
    numeric strings such as ``"2.0"`` are accepted.
    """
    for key in keys:
        value = row.get(key)
        if not value:
            continue
        try:
            return int(value)
        except (TypeError, ValueError):
            pass
        try:
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            continue
    return 0


def _build_devops_trend_summary_from_suites(rows: list[dict[str, Any]]) -> Dict[str, Any]:
    """Build the DevOps trend summary payload from evaluator suite rows.

    Each row contributes ``{"passed", "total"}`` for its suite. The suite name
    is read from ``name`` / ``suite_name`` / ``simulation``, the total from
    ``all`` / ``total`` (falling back to passed + failed + canceled), and the
    pass count from ``success`` / ``passed``. Rows that are not dicts, or that
    end up with a non-positive total, are skipped.

    When the optional DevOps category mapping can be loaded, suites are
    aggregated as major -> mid -> sub category, and suites not named by the
    mapping are placed under ``"その他" / "未分類"``. Otherwise the per-suite
    results are returned under ``"Suite pass rate"``.

    Args:
        rows: Suite summary rows; ``None`` or an empty list is allowed.

    Returns:
        ``{"DevOps": {...}}``; the inner dict is empty when no row is usable.
    """
    suite_results: dict[str, dict[str, int]] = {}
    for row in rows or []:
        if not isinstance(row, dict):
            # One malformed entry must not abort the whole summary.
            continue
        suite_name = str(row.get("name") or row.get("suite_name") or row.get("simulation") or "suite").strip()
        total = _suite_row_count(row, "all", "total")
        passed = _suite_row_count(row, "success", "passed")
        if total <= 0:
            failed = _suite_row_count(row, "fail", "failed")
            canceled = _suite_row_count(row, "cancel", "canceled")
            total = passed + failed + canceled
        if total <= 0:
            continue
        suite_results[suite_name] = {"passed": passed, "total": total}
    if not suite_results:
        return {"DevOps": {}}

    try:
        from perception_catalog_analyzer.path import DEVOPS_MAPPING_PATH

        with Path(DEVOPS_MAPPING_PATH).open("r", encoding="utf-8") as fh:
            category_mapping = yaml.safe_load(fh) or {}
    except Exception:
        # Deliberate best effort: the mapping is optional, so a missing
        # package or unreadable file falls back to the flat per-suite layout.
        category_mapping = {}

    if not isinstance(category_mapping, dict) or not category_mapping:
        return {"DevOps": {"Suite pass rate": suite_results}}

    mapped: Dict[str, Any] = {}
    matched_suites: set[str] = set()
    for major_category, mid_categories in category_mapping.items():
        if not isinstance(mid_categories, dict):
            continue
        major_payload: Dict[str, Any] = {}
        for mid_category, sub_categories in mid_categories.items():
            if not isinstance(sub_categories, dict):
                continue
            mid_payload: Dict[str, Any] = {}
            for sub_category, suite_names in sub_categories.items():
                if not isinstance(suite_names, list):
                    continue
                passed = 0
                total = 0
                for suite_name in suite_names:
                    # Result keys were stripped above; strip the mapped name
                    # too so stray whitespace does not prevent a match.
                    mapped_name = str(suite_name).strip()
                    result = suite_results.get(mapped_name)
                    if not result:
                        continue
                    matched_suites.add(mapped_name)
                    passed += int(result.get("passed", 0) or 0)
                    total += int(result.get("total", 0) or 0)
                mid_payload[str(sub_category)] = {"passed": passed, "total": total}
            if mid_payload:
                major_payload[str(mid_category)] = mid_payload
        if major_payload:
            mapped[str(major_category)] = major_payload

    unmatched = {
        suite_name: result
        for suite_name, result in suite_results.items()
        if suite_name not in matched_suites
    }
    if unmatched:
        mapped.setdefault("その他", {})["未分類"] = unmatched

    return {"DevOps": mapped}
_write_devops_trend_summary(path: Path, rows: list[dict[str, Any]]) -> Path | None: + summary_payload = _build_devops_trend_summary_from_suites(rows) + if not summary_payload.get("DevOps"): + return None + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as fh: + json.dump(summary_payload, fh, ensure_ascii=False, indent=2) + return path + + +def _suite_rows_from_existing_devops_summary(path: Path) -> list[dict[str, Any]]: + if not path.exists(): + return [] + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except Exception: + return [] + devops = payload.get("DevOps") if isinstance(payload, dict) else {} + if not isinstance(devops, dict): + return [] + suite_pass_rate = devops.get("Suite pass rate") + if not isinstance(suite_pass_rate, dict): + return [] + rows: list[dict[str, Any]] = [] + for suite_name, result in suite_pass_rate.items(): + if not isinstance(result, dict): + continue + rows.append( + { + "suite_name": str(suite_name), + "success": int(result.get("passed", 0) or 0), + "total": int(result.get("total", 0) or 0), + } + ) + return rows + + +def _has_release_download_artifacts(path: Path) -> bool: + return any(path.rglob("scene_result.pkl")) or any(path.rglob("*.pkl.z")) + + +def _find_release_parquet(path: Path) -> Path | None: + current = path / "current.parquet" + if current.exists(): + return current + for parquet in sorted(path.glob("*.parquet"), key=lambda p: p.name.lower()): + return parquet + return None + + +def _build_release_analysis_artifacts( + *, + task_id: str, + project_id: str, + job_id: str, + role: str, + output_path: Path, + phase: str, + run_eval: bool = False, + skip_large_file: bool = _RELEASE_SKIP_LARGE_FILE, + large_file_mb: float = _RELEASE_LARGE_FILE_MB, + progress_start: float = 48.0, + progress_end: float = 78.0, +) -> Dict[str, Any]: + """Create the normal app analysis files for a release job.""" + from lib import download_core + + eval_summary = _import_eval_summary() + 
def _build_release_analysis_artifacts(
    *,
    task_id: str,
    project_id: str,
    job_id: str,
    role: str,
    output_path: Path,
    phase: str,
    run_eval: bool = False,
    skip_large_file: bool = _RELEASE_SKIP_LARGE_FILE,
    large_file_mb: float = _RELEASE_LARGE_FILE_MB,
    progress_start: float = 48.0,
    progress_end: float = 78.0,
) -> Dict[str, Any]:
    """Create the normal app analysis files for a release job.

    Runs three stages inside ``output_path``, reporting task progress within
    the ``[progress_start, progress_end]`` window (download to 55% of the
    window, eval to 90%, parquet to the end):

    1. Download case archives, unless a parquet or downloaded artifacts are
       already present in ``output_path``.
    2. Optionally run eval and generate Summary.csv / Score.csv. This is
       skipped when ``run_eval`` is false or a parquet already existed.
    3. Reuse an existing parquet or generate one. A parquet generation
       failure is recorded as a warning instead of being raised.

    Args:
        task_id: Task whose log and progress are updated.
        project_id: Evaluator project id.
        job_id: Evaluator job id; may be empty when local artifacts exist.
        role: Release role label used as the prefix of log messages.
        output_path: Directory that receives the artifacts.
        phase: Download phase forwarded to the downloader.
        run_eval: Whether to run the eval stage.
        skip_large_file: Request skipping large files. The module-level
            release default is OR-ed in, so it can enable but not disable
            skipping.
        large_file_mb: Large-file threshold; falsy values fall back to the
            module default.
        progress_start: Progress percentage at which this job starts.
        progress_end: Progress percentage at which this job ends.

    Returns:
        Dict with ``path``, ``download``, ``eval``, ``parquet_path`` and
        ``warnings``.

    Raises:
        RuntimeError: If nothing is available locally and ``job_id`` is
            empty, or if the download yields no successful case artifact.
    """
    from lib import download_core

    eval_summary = _import_eval_summary()
    pkl_archive_to_parquet = _import_catalog_io()
    output_path.mkdir(parents=True, exist_ok=True)
    result: Dict[str, Any] = {
        "path": str(output_path),
        "download": {},
        "eval": {},
        "parquet_path": "",
        "warnings": [],
    }
    effective_skip_large_file = _RELEASE_SKIP_LARGE_FILE or bool(skip_large_file)
    effective_large_file_mb = float(large_file_mb or _RELEASE_LARGE_FILE_MB)

    progress_span = max(0.0, progress_end - progress_start)
    download_end = progress_start + progress_span * 0.55
    eval_end = progress_start + progress_span * 0.90
    existing_parquet = _find_release_parquet(output_path)

    def _on_progress(msg: str) -> None:
        append_task_log(task_id, f"{role}: {msg}")
        progress_msg = f"{role}: {msg}"
        pct = progress_start
        match = re.search(r"Downloading\s+(\d+)\s*/\s*(\d+)", msg)
        if match:
            current = int(match.group(1))
            total = max(1, int(match.group(2)))
            pct = progress_start + ((current - 1) / total) * max(0.0, download_end - progress_start)
        elif "Extracting" in msg or "Organizing" in msg:
            pct = download_end
        update_task_progress(task_id, message=progress_msg, pct=min(download_end, pct))

    def _on_warning(msg: str) -> None:
        result["warnings"].append(msg)
        append_task_log(task_id, f"WARNING: {role}: {msg}")

    if existing_parquet or _has_release_download_artifacts(output_path):
        append_task_log(task_id, f"{role}: using existing downloaded artifacts in {output_path}")
        update_task_progress(task_id, message=f"{role}: using existing downloaded artifacts", pct=download_end)
        failure_count = 0
        total_attempted = 0
        success_count = 0
        rows: list[dict[str, Any]] = []
    else:
        if not job_id:
            raise RuntimeError(f"{role}: no local artifacts found and no evaluator job id is available for download.")
        update_task_progress(task_id, message=f"{role}: finding downloadable case logs", pct=progress_start)
        failure_count, total_attempted, rows = download_core.run_download_results(
            project_id=project_id,
            job_id=job_id,
            suite_id=None,
            output_path=str(output_path),
            download_type="archives",
            phase=phase,
            skip_large_file=effective_skip_large_file,
            large_file_mb=effective_large_file_mb,
            keep_zip_files=False,
            suite_ids=None,
            on_progress=_on_progress,
            on_warning=_on_warning,
        )
        success_count = total_attempted - failure_count
        if success_count <= 0:
            raise RuntimeError(f"{role}: download produced no successful case artifacts.")
    result["download"] = {
        "total": total_attempted,
        "success": success_count,
        "failed": failure_count,
        "skip_large_file": effective_skip_large_file,
        "large_file_mb": effective_large_file_mb,
        "rows": rows[:100],
    }

    if run_eval and eval_summary and not existing_parquet:
        target_dirs = eval_summary.find_eval_result_dirs(str(output_path), recursive=True)
        total = len(target_dirs)
        summary_csv = output_path / "Summary.csv"
        score_csv = output_path / "Score.csv"
        if target_dirs and summary_csv.exists() and score_csv.exists():
            append_task_log(task_id, f"{role}: Summary.csv / Score.csv already exist; skipping eval")
            update_task_progress(task_id, message=f"{role}: existing Summary.csv / Score.csv found", pct=eval_end)
            statuses = []
            result["eval"] = {
                "directories_processed": total,
                "success": 0,
                "failed": 0,
                "skipped": total,
                "summary_path": str(summary_csv),
                "summary_rows": 0,
                "score_rows": 0,
            }
        elif target_dirs:
            statuses = _run_eval_result_dirs(
                task_id=task_id,
                eval_summary=eval_summary,
                target_dirs=target_dirs,
                overwrite=False,
                eval_workers=_eval_worker_count({}, total),
                pct_start=download_end,
                pct_end=eval_end,
                label=f"{role}: eval_result",
            )
        else:
            update_task_progress(task_id, message=f"{role}: no eval_result directories found", pct=eval_end)
            statuses = []
        if target_dirs and not result["eval"]:
            update_task_progress(task_id, message=f"{role}: generating Summary.csv / Score.csv", pct=eval_end)
            csv_info = eval_summary.generate_summary_and_score_csv(str(output_path))
            result["eval"] = {
                "directories_processed": len(target_dirs),
                "success": sum(1 for item in statuses if item.get("status") == "success"),
                "failed": sum(1 for item in statuses if item.get("status") == "failed"),
                "skipped": sum(1 for item in statuses if item.get("status") == "skipped"),
                "summary_path": csv_info.get("summary_path", ""),
                "summary_rows": csv_info.get("summary_rows", 0),
                "score_rows": csv_info.get("score_rows", 0),
            }
        elif not result["eval"]:
            result["eval"] = {
                "directories_processed": 0,
                "success": 0,
                "failed": 0,
                "skipped": 0,
            }
    elif not run_eval:
        append_task_log(task_id, f"{role}: skipping eval; parquet is sufficient for release PDF generation")
        result["eval"] = {"enabled": False, "reason": "release_pdf_uses_parquet"}
    elif existing_parquet:
        append_task_log(task_id, f"{role}: skipping eval because parquet already exists")
        result["eval"] = {"enabled": False, "reason": "existing_parquet"}

    # Re-check: the download or eval stage may have produced a parquet.
    existing_parquet = _find_release_parquet(output_path)
    if existing_parquet:
        append_task_log(task_id, f"{role}: existing parquet found: {existing_parquet}")
        result["parquet_path"] = str(existing_parquet)
        update_task_progress(task_id, message=f"{role}: existing parquet found", pct=progress_end)
    elif pkl_archive_to_parquet:
        try:
            update_task_progress(task_id, message=f"{role}: generating parquet", pct=eval_end)
            # Stay inside this job's progress window. A fixed 99% would make
            # the task bar jump ahead and then fall back when the next job
            # (or the specsheet stage) reports its own, lower percentage.
            result["parquet_path"] = pkl_archive_to_parquet(
                str(output_path),
                on_progress=_parquet_progress_callback(
                    task_id,
                    prefix=f"{role}: parquet",
                    pct_start=eval_end,
                    pct_end=progress_end,
                ),
                on_skip=lambda path, reason: append_task_log(
                    task_id,
                    f"WARNING: {role}: parquet skipped {path}: {reason}",
                ),
                project_id=project_id,
                job_id=job_id,
            ) or ""
            update_task_progress(task_id, message=f"{role}: parquet generated", pct=progress_end)
        except Exception as exc:
            # Best effort: the caller still gets download/eval results, and
            # the failure is surfaced through the warnings list and the log.
            warning = f"Parquet generation failed: {exc}"
            result["warnings"].append(warning)
            append_task_log(task_id, f"WARNING: {role}: {warning}")

    append_task_log(
        task_id,
        (
            f"{role}: analysis artifacts ready at {output_path} "
            f"({success_count}/{total_attempted} downloads)"
        ),
    )
    return result
def job_run_release_specsheet_workflow(task_id: str, parameters: Dict[str, Any]) -> None:
    """Schedule the standard release evaluator jobs, process them as app-native runs, then build a release specsheet.

    Stages, with the task progress each one reports:

    1. Validate parameters and write ``metadata.yaml`` under the release root.
    2. For each role (performance, devops, optionally planning_test), reuse a
       supplied job id, reuse local artifacts, or schedule a new evaluator
       job (2%).
    3. Wait for each evaluator job and fetch its suite summary (5-45%).
    4. Build download / eval / parquet artifacts per role and record a
       completed run for each (48-78%).
    5. Write the DevOps trend summary (78%) and generate the specsheet PDF
       (82-100%).

    Args:
        task_id: Task whose status, log, progress and result summary are
            updated.
        parameters: Workflow parameters. ``project_id``, ``target_name``,
            ``output_path`` and a version (``version`` or
            ``trend_metadata["pilot_auto_version"]``) are required, and
            ``trend_metadata["tags"]`` must contain ``trend``.

    Raises:
        Exception: Any failure is logged, recorded as a failed run and task,
            and re-raised.
    """
    update_task_status(task_id, "running")
    append_task_log(task_id, "Starting release specsheet workflow")
    _mark_run_status(
        task_id,
        parameters,
        task_type="run_release_specsheet_workflow",
        status="running",
        create_missing=True,
    )
    try:
        from lib import evaluator_api
        from lib.specsheet_report import (
            DEFAULT_SPECSHEET_LABELS,
            DEFAULT_SPECSHEET_TOPIC,
            generate_specsheet_pdf,
            resolve_specsheet_topic_name,
        )

        project_id = str(parameters.get("project_id") or "").strip()
        target_name = str(parameters.get("target_name") or "").strip()
        output_path = str(parameters.get("output_path") or "").strip()
        environment = str(parameters.get("environment") or "default").strip() or "default"
        is_tag = bool(parameters.get("is_tag", False))
        metadata = parameters.get("trend_metadata") if isinstance(parameters.get("trend_metadata"), dict) else {}
        version = str(parameters.get("version") or metadata.get("pilot_auto_version") or "").strip()
        topic = str(parameters.get("topic") or metadata.get("topic_name") or DEFAULT_SPECSHEET_TOPIC).strip()
        description = str(parameters.get("description") or target_name or "").strip()
        poll_interval = float(parameters.get("poll_interval", 60.0))
        max_wait_seconds = float(parameters.get("max_wait_seconds", 3600.0 * 24 * 7))
        analysis_phase = str(
            parameters.get("analysis_phase")
            or "perception.object_recognition.tracking.objects"
        ).strip()
        skip_large_file = _RELEASE_SKIP_LARGE_FILE
        large_file_mb = float(parameters.get("large_file_mb") or _RELEASE_LARGE_FILE_MB)
        labels = parameters.get("labels") or DEFAULT_SPECSHEET_LABELS
        labels = [str(label).strip() for label in labels if str(label).strip()]
        if not labels:
            labels = list(DEFAULT_SPECSHEET_LABELS)

        if not project_id or not target_name or not output_path or not version:
            raise ValueError("Missing project_id, target_name, output_path, or Pilot.Auto version.")
        # ``tags`` may be absent, null, or a bare string in hand-written
        # metadata. Normalize it so those cases hit the intended validation
        # message instead of a TypeError or a per-character comparison.
        raw_tags = metadata.get("tags") or []
        if isinstance(raw_tags, str):
            raw_tags = [raw_tags]
        if "trend" not in [str(tag).strip() for tag in raw_tags]:
            raise ValueError("Release metadata must include tags: [trend].")

        release_root = Path(output_path)
        release_root.mkdir(parents=True, exist_ok=True)
        _write_release_metadata_file(release_root / "metadata.yaml", metadata)
        performance_path = release_root / "performance"
        devops_path = release_root / "devops"
        role_paths = {
            "performance": performance_path,
            "devops": devops_path,
            "planning_test": release_root / "planning_test",
        }
        os.environ["AUTH_PROFILE"] = environment
        os.environ["EVALUATOR_ENVIRONMENT"] = environment

        api = evaluator_api.EvaluationRunAPI()
        optional_catalog_enabled = bool(parameters.get("optional_catalog_enabled", False))
        optional_catalog_id = str(
            parameters.get("optional_catalog_id") or _RELEASE_OPTIONAL_CATALOG_ID
        ).strip()
        optional_integration_id = str(parameters.get("optional_integration_id") or "").strip()
        if optional_catalog_enabled and optional_catalog_id and not optional_integration_id:
            append_task_log(task_id, f"Resolving Planning Test catalog integration: {optional_catalog_id}")
            optional_integration_id = _resolve_active_integration_id(api, project_id, optional_catalog_id)
        jobs = [
            {
                "role": "performance",
                "label": "Performance Test",
                "catalog_id": str(parameters.get("performance_catalog_id") or _RELEASE_PERFORMANCE_CATALOG_ID),
                "integration_id": str(parameters.get("performance_integration_id") or _RELEASE_PERFORMANCE_INTEGRATION_ID),
                "job_id": str(parameters.get("performance_job_id") or "").strip(),
            },
            {
                "role": "devops",
                "label": "Devops Test",
                "catalog_id": str(parameters.get("devops_catalog_id") or _RELEASE_DEVOPS_CATALOG_ID),
                "integration_id": str(parameters.get("devops_integration_id") or _RELEASE_DEVOPS_INTEGRATION_ID),
                "job_id": str(parameters.get("devops_job_id") or "").strip(),
            },
        ]
        if optional_catalog_enabled:
            jobs.append(
                {
                    "role": "planning_test",
                    "label": "Planning Test",
                    "catalog_id": optional_catalog_id,
                    "integration_id": optional_integration_id,
                    "job_id": str(parameters.get("optional_job_id") or "").strip(),
                }
            )
        summary: Dict[str, Any] = {
            "job": "run_release_specsheet_workflow",
            "release_root": str(release_root),
            "version": version,
            "topic": topic,
            "evaluator_jobs": {},
            "analysis_artifacts": {},
            "specsheet_pdf": "",
        }
        update_task_result_summary(task_id, summary)
        update_task_progress(task_id, message="Preparing release evaluator jobs", pct=2)

        # Stage 2: decide per role whether to reuse a job, reuse local
        # artifacts, or schedule a fresh evaluator job.
        for item in jobs:
            schedule_description = f"{description} | {item['label']}"
            item["description"] = schedule_description
            role = str(item["role"])
            local_path = role_paths[role]
            local_ready = _find_release_parquet(local_path) is not None or _has_release_download_artifacts(local_path)
            item["local_artifacts_ready"] = local_ready
            job_id = str(item.get("job_id") or "").strip()
            if job_id:
                append_task_log(task_id, f"Using existing {item['label']}: {job_id}")
                status = "existing"
            elif local_ready:
                append_task_log(task_id, f"Using existing local artifacts for {item['label']}: {local_path}")
                status = "local_artifacts"
            else:
                append_task_log(task_id, f"Scheduling {item['label']}: catalog={item['catalog_id']}")
                result = api.schedule_job(
                    project_id=project_id,
                    catalog_id=item["catalog_id"],
                    integration_id=item["integration_id"],
                    target_name=target_name,
                    suite_ids=None,
                    max_retries=0,
                    description=schedule_description,
                    clean_build=True,
                    debug=False,
                    release=False,
                    record_caret=False,
                    log_expiration_time_in_days=10.0,
                    is_tag=is_tag,
                )
                job_id = str(result.get("job_id") or "").strip()
                if not job_id:
                    raise RuntimeError(f"No job_id returned for {item['label']}.")
                item["job_id"] = job_id
                status = "scheduled"
            report_url = evaluator_api.get_job_report_url(project_id, job_id) if job_id else ""
            summary["evaluator_jobs"][item["role"]] = {
                "job_id": job_id,
                "report_url": report_url,
                "catalog_id": item["catalog_id"],
                "integration_id": item["integration_id"],
                "status": status,
                "description": schedule_description,
            }
            if status == "scheduled":
                append_task_log(task_id, f"Scheduled {item['label']}: {job_id}")
            update_task_result_summary(task_id, summary)

        # Stage 3: wait for every evaluator job; each job owns an equal slice
        # of the 5-45% progress range.
        wait_span = 40.0 / max(1, len(jobs))
        for idx, item in enumerate(jobs, start=1):
            job_id = str(item["job_id"])
            label = str(item["label"])
            base_pct = 5 + (idx - 1) * wait_span
            if not job_id and item.get("local_artifacts_ready"):
                append_task_log(task_id, f"Skipping evaluator wait for {label}; local artifacts already exist.")
                summary["evaluator_jobs"][item["role"]]["status"] = "local_artifacts"
                update_task_progress(task_id, message=f"{label}: using local artifacts", pct=base_pct + wait_span - 2.0)
                update_task_result_summary(task_id, summary)
                continue

            # All per-iteration values are bound as defaults so the callback
            # never observes a later iteration's role, label or base.
            def _on_check(
                status: str,
                elapsed: float,
                *,
                role: str = str(item["role"]),
                pct_base: float = base_pct,
                label: str = label,
            ) -> None:
                # Guard against a zero or negative wait budget.
                wait_fraction = elapsed / max_wait_seconds if max_wait_seconds > 0 else 1.0
                pct = min(pct_base + wait_fraction * max(2.0, wait_span - 2.0), pct_base + wait_span - 2.0)
                summary["evaluator_jobs"][role]["status"] = status
                update_task_progress(
                    task_id,
                    message=f"{label}: {status} ({elapsed / 3600:.1f}h elapsed)",
                    pct=pct,
                )
                update_task_result_summary(task_id, summary)

            append_task_log(task_id, f"Waiting for {label}: {job_id}")
            final_report = api.wait_for_job_completion(
                project_id=project_id,
                job_id=job_id,
                poll_interval=poll_interval,
                max_wait_seconds=max_wait_seconds,
                on_check=_on_check,
            )
            status = evaluator_api.extract_job_status(final_report)
            summary["evaluator_jobs"][item["role"]]["status"] = status
            append_task_log(task_id, f"{label} completed with status: {status}")
            try:
                suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True)
            except Exception as exc:
                # The suite summary only feeds the DevOps trend summary; a
                # fetch failure must not abort the release.
                append_task_log(task_id, f"WARNING: Could not fetch suite summary for {label}: {exc}")
                suite_rows = []
            item["suite_rows"] = suite_rows
            summary["evaluator_jobs"][item["role"]]["suite_count"] = len(suite_rows)
            update_task_result_summary(task_id, summary)

        # Stage 4: build analysis artifacts per role inside 48-78%.
        update_task_progress(task_id, message="Building normal CSV/parquet analysis artifacts", pct=48)
        artifact_span = 30.0 / max(1, len(jobs))
        for artifact_idx, item in enumerate(jobs):
            role = str(item["role"])
            analysis_path = role_paths[role]
            artifact_summary = _build_release_analysis_artifacts(
                task_id=task_id,
                project_id=project_id,
                job_id=str(item["job_id"]),
                role=role,
                output_path=analysis_path,
                phase=analysis_phase,
                run_eval=bool(parameters.get("run_eval", False)),
                skip_large_file=skip_large_file,
                large_file_mb=large_file_mb,
                progress_start=48 + (artifact_span * artifact_idx),
                progress_end=48 + (artifact_span * (artifact_idx + 1)),
            )
            summary["analysis_artifacts"][role] = artifact_summary
            update_task_result_summary(task_id, summary)

            child_params = {
                **parameters,
                "output_path": str(analysis_path),
                "catalog_id": item["catalog_id"],
                "integration_id": item["integration_id"],
                "job_id": item["job_id"],
                "download_type": "archives",
                "phase": analysis_phase,
                "skip_large_file": skip_large_file,
                "large_file_mb": large_file_mb,
                "run_eval": bool(parameters.get("run_eval", False)),
                "generate_parquet": True,
                "eval_recursive": bool(parameters.get("run_eval", False)),
            }
            _mark_run_status(
                task_id,
                child_params,
                task_type="run_release_specsheet_workflow",
                status="completed",
                result_path=str(analysis_path),
                create_missing=True,
                extra={
                    "release_specsheet": {
                        "root": str(release_root),
                        "role": role,
                        "metadata": metadata,
                    },
                    "evaluator": {
                        "job_id": str(item["job_id"]),
                        "report_url": summary["evaluator_jobs"][role].get("report_url", ""),
                        "status": summary["evaluator_jobs"][role].get("status", ""),
                        "catalog_id": item["catalog_id"],
                        "integration_id": item["integration_id"],
                        "target_name": target_name,
                        "description": str(item.get("description") or ""),
                        "title": str(item.get("description") or ""),
                    },
                    "download": {
                        **artifact_summary.get("download", {}),
                        "mode": "release_specsheet",
                        "download_type": "archives",
                        "phase": analysis_phase,
                    },
                    "evaluation": {
                        **artifact_summary.get("eval", {}),
                        "enabled": bool(parameters.get("run_eval", False)),
                        "recursive": bool(parameters.get("run_eval", False)),
                    },
                    "parquet": {
                        "enabled": True,
                        "path": artifact_summary.get("parquet_path", ""),
                    },
                },
            )

        detected_topic, detected_topics = resolve_specsheet_topic_name(
            performance_path,
            topic,
            fallback_topic=DEFAULT_SPECSHEET_TOPIC,
        )
        if detected_topic != topic:
            append_task_log(
                task_id,
                (
                    f"Using detected specsheet topic {detected_topic} instead of requested topic {topic} "
                    f"(detected: {', '.join(detected_topics) if detected_topics else 'none'})"
                ),
            )
            topic = detected_topic
        summary["topic"] = topic
        summary["detected_topics"] = detected_topics
        update_task_result_summary(task_id, summary)

        # Stage 5a: DevOps trend summary.
        update_task_progress(task_id, message="Writing release trend summaries", pct=78)
        write_trend_metadata(devops_path, metadata)
        devops_job = next(item for item in jobs if item["role"] == "devops")
        devops_summary_target = devops_path / "resources" / "summary.json"
        devops_suite_rows = list(devops_job.get("suite_rows") or [])
        if not devops_suite_rows:
            existing_suite_rows = _suite_rows_from_existing_devops_summary(devops_summary_target)
            if existing_suite_rows:
                append_task_log(task_id, "Rebuilding DevOps trend summary from existing suite pass-rate rows.")
                devops_suite_rows = existing_suite_rows
        devops_summary_path = _write_devops_trend_summary(devops_summary_target, devops_suite_rows)
        # Exactly one outcome is logged: written, reused, or missing. A
        # reused file is no longer also reported as "written".
        if devops_summary_path is not None:
            append_task_log(task_id, f"DevOps trend summary written: {devops_summary_path}")
        elif devops_summary_target.exists():
            devops_summary_path = devops_summary_target
            append_task_log(task_id, f"Using existing DevOps trend summary: {devops_summary_path}")
        else:
            append_task_log(task_id, "WARNING: DevOps trend summary had no suite pass-rate rows.")

        # Stage 5b: specsheet PDF.
        update_task_progress(task_id, message="Generating app-native release specsheet", pct=82)
        specsheet_pdf, generated = generate_specsheet_pdf(
            performance_path,
            project_id=project_id,
            version=version,
            labels=labels,
            topic_name=topic,
            include_trend=True,
            trend_metadata=metadata,
            force=bool(parameters.get("overwrite", True)),
            progress_callback=lambda msg: append_task_log(task_id, f"specsheet: {msg}"),
        )
        summary["specsheet_pdf"] = str(specsheet_pdf)
        summary["specsheet_generated"] = bool(generated)

        update_task_progress(task_id, message="Release specsheet ready", pct=100)
        update_task_result_summary(task_id, summary)
        _mark_run_status(
            task_id,
            parameters,
            task_type="run_release_specsheet_workflow",
            status="completed",
            result_path=str(specsheet_pdf),
            extra={
                "release_specsheet": {
                    "root": str(release_root),
                    "specsheet_pdf": str(specsheet_pdf),
                    "evaluator_jobs": summary["evaluator_jobs"],
                    "analysis_artifacts": summary["analysis_artifacts"],
                    "metadata": metadata,
                }
            },
        )
        append_task_log(task_id, f"Release specsheet PDF ready: {specsheet_pdf}")
        update_task_status(task_id, "completed", result_path=str(specsheet_pdf))
    except Exception as e:
        # Top-level task boundary: record the failure everywhere the UI looks
        # for it, then re-raise so the task runner sees the error too.
        append_task_log(task_id, f"Failed: {e}")
        _mark_run_status(
            task_id,
            parameters,
            task_type="run_release_specsheet_workflow",
            status="failed",
            error_message=str(e),
            create_missing=True,
        )
        update_task_status(task_id, "failed", error_message=str(e))
        raise
Dict[str, Any]) -> None: + """ + Full combined workflow: Run Evaluator + Download + Eval + Parquet. + + Steps: + 1. Schedule evaluator job (get job_id) + 2. Poll until evaluator completes + 3. Download results + 4. Run eval + 5. Generate parquet + """ + update_task_status(task_id, "running") + append_task_log(task_id, "Starting run_evaluator_and_process workflow") + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="running", + create_missing=True, + ) + + try: + from lib import evaluator_api + from lib import download_core + + # Import eval_summary + eval_summary = _import_eval_summary() + pkl_archive_to_parquet = _import_catalog_io() + + # Extract parameters + project_id = parameters.get("project_id") + catalog_id = parameters.get("catalog_id") + integration_id = parameters.get("integration_id") + source_job_id = parameters.get("source_job_id") + suite_ids = parameters.get("suite_ids") + target_name = parameters.get("target_name") # branch name or tag + description = parameters.get("description", "no description") + output_path = parameters.get("output_path") + trend_metadata = parameters.get("trend_metadata") if isinstance(parameters.get("trend_metadata"), dict) else None + trend_role = str(parameters.get("trend_role") or "").strip() + + def _write_devops_trend_summary_from_suites(rows: list[dict[str, Any]]) -> None: + if not output_path: + return + summary_payload: Dict[str, Any] = {"DevOps": {"Suite pass rate": {}}} + for row in rows or []: + suite_name = str(row.get("name") or row.get("suite_name") or row.get("simulation") or "suite").strip() + total = int(row.get("all", 0) or row.get("total", 0) or 0) + passed = int(row.get("success", 0) or row.get("passed", 0) or 0) + if total <= 0: + failed = int(row.get("fail", 0) or row.get("failed", 0) or 0) + canceled = int(row.get("cancel", 0) or row.get("canceled", 0) or 0) + total = passed + failed + canceled + if total <= 0: + continue + summary_payload["DevOps"]["Suite 
pass rate"][suite_name] = { + "passed": passed, + "total": total, + } + if not summary_payload["DevOps"]["Suite pass rate"]: + return + resource_dir = Path(output_path) / "resources" + resource_dir.mkdir(parents=True, exist_ok=True) + with (resource_dir / "summary.json").open("w", encoding="utf-8") as fh: + json.dump(summary_payload, fh, ensure_ascii=False, indent=2) + + # Eval options + run_eval = parameters.get("run_eval", True) + generate_parquet = parameters.get("generate_parquet", True) + eval_recursive = parameters.get("eval_recursive", True) + eval_overwrite = parameters.get("eval_overwrite", False) + + # Download options + download_type = parameters.get("download_type", "archives") + phase = parameters.get("phase", "perception.object_recognition.tracking.objects") + skip_large_file = parameters.get("skip_large_file", False) + large_file_mb = float(parameters.get("large_file_mb", 50.0)) + keep_zip_files = parameters.get("keep_zip_files", False) + + # Evaluator polling options + poll_interval = float(parameters.get("poll_interval", 60.0)) + max_wait_seconds = float(parameters.get("max_wait_seconds", 3600.0 * 24 * 7)) # 1 week default + download_ready_timeout = float(parameters.get("download_ready_timeout", 1800.0)) + download_ready_poll_interval = float( + parameters.get("download_ready_poll_interval", min(max(poll_interval, 10.0), 60.0)) + ) + + # Scheduling options + max_retries = parameters.get("max_retries", 1) + clean_build = parameters.get("clean_build", False) + debug = parameters.get("debug", False) + is_tag = parameters.get("is_tag", False) + release = bool(parameters.get("release", False)) + record_caret = bool(parameters.get("record_caret", False)) + log_expiration_time_in_days = float(parameters.get("log_expiration_time_in_days", 14.0)) + + has_source_job = bool(source_job_id) + has_fresh_source = bool(integration_id and target_name) + if not project_id or not catalog_id or not output_path or (not has_source_job and not has_fresh_source): + 
_mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message="Missing required parameters", + create_missing=True, + ) + update_task_status(task_id, "failed", error_message="Missing required parameters") + return + + environment = parameters.get("environment", "default") + os.environ["AUTH_PROFILE"] = environment + os.environ["EVALUATOR_ENVIRONMENT"] = environment + + def on_progress(msg: str) -> None: + append_task_log(task_id, msg) + _append_run_event(task_id, parameters, task_type="run_evaluator_and_process", message=msg) + update_task_progress(task_id, message=msg) + + def on_warning(msg: str) -> None: + append_task_log(task_id, f"WARNING: {msg}") + _append_run_event(task_id, parameters, task_type="run_evaluator_and_process", message=f"WARNING: {msg}") + + # Step 1: Schedule evaluator job + on_progress("Step 1/5: Scheduling evaluator job...") + if source_job_id: + append_task_log( + task_id, + f"Project: {project_id}, Catalog: {catalog_id}, Reuse build from job: {source_job_id}", + ) + else: + append_task_log(task_id, f"Project: {project_id}, Catalog: {catalog_id}, Target: {target_name}") + + try: + api = evaluator_api.EvaluationRunAPI() + + result = api.schedule_job( + project_id=project_id, + catalog_id=catalog_id, + integration_id=integration_id, + target_name=target_name, + source_job_id=source_job_id, + suite_ids=suite_ids, + max_retries=max_retries, + description=description, + clean_build=clean_build, + debug=debug, + release=release, + record_caret=record_caret, + log_expiration_time_in_days=log_expiration_time_in_days, + is_tag=is_tag, + ) + except Exception as e: + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Failed to schedule evaluator job: {e}", + create_missing=True, + ) + update_task_status(task_id, "failed", error_message=f"Failed to schedule evaluator job: {e}") + return + + job_id = result.get("job_id") + if not 
job_id: + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message="No job_id returned from evaluator API", + create_missing=True, + ) + update_task_status(task_id, "failed", error_message="No job_id returned from evaluator API") + return + + report_url = evaluator_api.get_job_report_url(project_id, job_id) + append_task_log(task_id, f"Scheduled evaluator job: {job_id}") + append_task_log(task_id, f"Report URL: {report_url}") + update_task_progress(task_id, message=f"Evaluator job scheduled: {job_id}", pct=5) + summary = { + "job": "run_evaluator_and_process", + "evaluator_job_id": job_id, + "evaluator_report_url": report_url, + "evaluator_status": "scheduled", + "source_job_id": source_job_id or "", + "download_summary": {"total": 0, "success": 0, "failed": 0}, + "eval_summary": {}, + "parquet_path": "", + } + update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + create_missing=True, + extra={ + "evaluator": { + "job_id": job_id, + "report_url": report_url, + "status": "scheduled", + "catalog_id": catalog_id, + "integration_id": integration_id or "", + "source_job_id": source_job_id or "", + "target_name": target_name or "", + "description": description or "", + "is_tag": bool(is_tag), + "title": description or "", + } + }, + ) + + if trend_metadata: + try: + write_trend_metadata(output_path, trend_metadata) + append_task_log(task_id, "Saved release trend metadata.") + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "trend": { + "enabled": True, + "role": trend_role, + "metadata": trend_metadata, + } + }, + ) + except Exception as e: + append_task_log(task_id, f"WARNING: Could not save release trend metadata: {e}") + + # Step 2: Poll for evaluator completion + on_progress("Step 2/5: Waiting for evaluator to complete...") + append_task_log(task_id, "This may take a while 
depending on evaluator queue and run time...") + last_suite_snapshot = {"key": None, "time": 0.0} + + def on_eval_progress(status: str, elapsed: float) -> None: + hours = elapsed / 3600 + msg = f"Evaluator status: {status} (elapsed: {hours:.1f}h)" + append_task_log(task_id, msg) + # Progress: 5% to 40% during evaluation wait + pct = min(5 + (elapsed / max_wait_seconds) * 35, 40) + update_task_progress(task_id, message=f"Evaluator: {status} ({hours:.1f}h elapsed)", pct=pct) + summary["evaluator_status"] = status + + should_snapshot = elapsed < 60 or (elapsed - last_suite_snapshot["time"]) >= 600 + if not should_snapshot: + update_task_result_summary(task_id, summary) + return + + try: + suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True) + except Exception: + update_task_result_summary(task_id, summary) + return + + suite_summary = _summarize_suite_reports(suite_rows) + totals = _suite_case_totals(suite_rows) + snapshot_key = ( + totals["total"], + totals["success"], + totals["failed"], + totals["canceled"], + tuple((row["suite_name"], row["failed"]) for row in suite_summary if row["failed"] > 0), + ) + last_suite_snapshot["time"] = elapsed + if snapshot_key == last_suite_snapshot["key"]: + summary["evaluator_case_totals"] = totals + summary["evaluator_suites"] = suite_summary + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "evaluator": { + "status": status, + "case_totals": totals, + "suites": suite_summary, + } + }, + ) + update_task_result_summary(task_id, summary) + return + + last_suite_snapshot["key"] = snapshot_key + summary["evaluator_case_totals"] = totals + summary["evaluator_suites"] = suite_summary + if totals["total"] > 0: + failing = [row for row in suite_summary if row["failed"] > 0] + if failing: + top = ", ".join(f"{row['suite_name']}={row['failed']}" for row in failing[:3]) + append_task_log( + task_id, + ( + "Evaluator progress snapshot: " + 
f"{totals['success']}/{totals['total']} success, " + f"{totals['failed']} failed, {totals['canceled']} canceled. " + f"Failing suites: {top}" + ), + ) + else: + append_task_log( + task_id, + ( + "Evaluator progress snapshot: " + f"{totals['success']}/{totals['total']} success, " + f"{totals['failed']} failed, {totals['canceled']} canceled." + ), + ) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "evaluator": { + "status": status, + "case_totals": totals, + "suites": suite_summary, + } + }, + ) + update_task_result_summary(task_id, summary) + + try: + final_report = api.wait_for_job_completion( + project_id=project_id, + job_id=job_id, + poll_interval=poll_interval, + max_wait_seconds=max_wait_seconds, + on_check=on_eval_progress, + ) + except evaluator_api.EvaluationAPIError as e: + append_task_log(task_id, f"Evaluator wait error: {e}") + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Evaluator failed or timed out: {e}", + ) + update_task_status(task_id, "failed", error_message=f"Evaluator failed or timed out: {e}") + return + + test_status = evaluator_api.extract_job_status(final_report) + try: + suite_rows = api.get_suite_summary(project_id, job_id, use_available_case_results=True) + except Exception as e: + append_task_log(task_id, f"Could not fetch suite summary: {e}") + suite_rows = [] + try: + case_reports = api.get_case_reports(project_id, job_id) + except Exception as e: + append_task_log(task_id, f"Could not fetch case reports: {e}") + case_reports = [] + + if trend_metadata and trend_role == "devops": + try: + _write_devops_trend_summary_from_suites(suite_rows) + append_task_log(task_id, "Saved DevOps trend summary.") + except Exception as e: + append_task_log(task_id, f"WARNING: Could not save DevOps trend summary: {e}") + + evaluator_summary = _build_evaluator_result_summary( + job_id=job_id, + report_url=report_url, + 
evaluator_status=test_status, + final_report=final_report, + suite_rows=suite_rows, + failed_cases=case_reports, + ) + summary.update(evaluator_summary) + update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "evaluator": { + "job_id": job_id, + "report_url": report_url, + "status": test_status, + "title": summary.get("evaluator_title", ""), + "scheduled_by": summary.get("evaluator_scheduled_by", ""), + "catalog_id": summary.get("evaluator_catalog_id", ""), + "catalog_name": summary.get("evaluator_catalog_name", ""), + "catalog_version_id": summary.get("evaluator_catalog_version_id", ""), + "catalog_url": summary.get("evaluator_catalog_url", ""), + "target": summary.get("evaluator_target", ""), + "git_sha": summary.get("evaluator_git_sha", ""), + "git_ref_url": summary.get("evaluator_git_ref_url", ""), + "git_commit_url": summary.get("evaluator_git_commit_url", ""), + "source_url": summary.get("evaluator_source_url", ""), + "source_repo_label": summary.get("evaluator_source_repo_label", ""), + "build_status": summary.get("evaluator_build_status", ""), + "test_status": summary.get("evaluator_test_status", ""), + "fail_message": summary.get("evaluator_fail_message", ""), + "case_totals": summary.get("evaluator_case_totals", {}), + "suites": summary.get("evaluator_suites", []), + "failed_cases": summary.get("evaluator_failed_cases", []), + } + }, + ) + + fail_message = summary.get("evaluator_fail_message", "") + if evaluator_api.is_success_job_status(test_status): + update_task_progress(task_id, message="Evaluator completed successfully", pct=40) + append_task_log(task_id, f"Evaluator completed with status: {test_status}") + else: + append_task_log(task_id, f"Evaluator completed with non-success status: {test_status}") + if fail_message: + append_task_log(task_id, f"Evaluator fail message: {fail_message}") + case_totals = summary.get("evaluator_case_totals", {}) + 
append_task_log( + task_id, + ( + "Evaluator result summary: " + f"{case_totals.get('success', 0)}/{case_totals.get('total', 0)} success, " + f"{case_totals.get('failed', 0)} failed, {case_totals.get('canceled', 0)} canceled" + ), + ) + failed_cases = summary.get("evaluator_failed_cases", []) + for case in failed_cases[:5]: + detail = case.get("fail_message", "") or case.get("status", "") + append_task_log( + task_id, + f"Failed case: {case.get('suite_name', '')} / {case.get('scenario_name', '')} - {detail}", + ) + update_task_progress(task_id, message=f"Evaluator finished with status {test_status}; trying download", pct=40) + + # Step 3: Download results + on_progress("Step 3/5: Downloading results...") + update_task_progress(task_id, message="Downloading results...", pct=45) + + download_deadline = time.time() + download_ready_timeout + while True: + try: + dl_result = download_core.run_download_results( + project_id=project_id, + job_id=job_id, + suite_id=None, + output_path=output_path, + download_type=download_type, + phase=phase, + skip_large_file=skip_large_file, + large_file_mb=large_file_mb, + keep_zip_files=keep_zip_files, + suite_ids=suite_ids, + on_progress=on_progress, + on_warning=on_warning, + ) + failure_count, total_attempted, rows = dl_result + success_count = total_attempted - failure_count + download_success = success_count > 0 + + if not download_success: + evaluator_msg = "" + if not evaluator_api.is_success_job_status(test_status): + evaluator_msg = f" Evaluator status was {test_status}." 
+ _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Download failed: {failure_count} of {total_attempted} scenarios failed.{evaluator_msg}", + result_path=output_path, + ) + update_task_status(task_id, "failed", + error_message=f"Download failed: {failure_count} of {total_attempted} scenarios failed.{evaluator_msg}") + return + break + + except RuntimeError as e: + if "No case reports found" not in str(e) or time.time() >= download_deadline: + evaluator_msg = "" + if not evaluator_api.is_success_job_status(test_status): + evaluator_msg = ( + f" Evaluator status was {test_status}. " + "This usually means the job failed before producing downloadable case logs." + ) + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Download failed: {e}{evaluator_msg}", + result_path=output_path, + ) + update_task_status(task_id, "failed", error_message=f"Download failed: {e}{evaluator_msg}") + return + + wait_seconds = min( + download_ready_poll_interval, + max(1.0, download_deadline - time.time()), + ) + msg = f"Case reports are not ready yet; retrying download in {wait_seconds:.0f}s" + append_task_log(task_id, f"{msg}. 
Detail: {e}") + update_task_progress(task_id, message=msg, pct=45) + time.sleep(wait_seconds) + + except Exception as e: + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="failed", + error_message=f"Download failed: {e}", + result_path=output_path, + ) + update_task_status(task_id, "failed", error_message=f"Download failed: {e}") + return + + update_task_progress(task_id, message=f"Download complete: {success_count}/{total_attempted} succeeded", pct=60) + summary["download_summary"] = { + "total": total_attempted, + "success": success_count, + "failed": failure_count, + } + summary["download_rows"] = rows[:500] + update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "download": { + "mode": "run_evaluator_and_process", + "total": total_attempted, + "success": success_count, + "failed": failure_count, + "download_type": download_type, + "phase": phase, + "skip_large_file": bool(skip_large_file), + "large_file_mb": large_file_mb, + "keep_zip_files": bool(keep_zip_files), + "rows": rows[:100], + } + }, + ) + + # Step 4: Run eval + if run_eval: + on_progress("Step 4/5: Running evaluation...") + update_task_progress(task_id, message="Running evaluation...", pct=65) + + target_dirs = eval_summary.find_eval_result_dirs(output_path, recursive=eval_recursive) + if target_dirs: + total = len(target_dirs) + eval_statuses = _run_eval_result_dirs( + task_id=task_id, + eval_summary=eval_summary, + target_dirs=target_dirs, + overwrite=eval_overwrite, + eval_workers=_eval_worker_count(parameters, total), + pct_start=65.0, + pct_end=85.0, + label="Eval", + ) + + # Generate summary CSVs + update_task_progress(task_id, message="Generating Summary.csv / Score.csv", pct=85) + csv_info = eval_summary.generate_summary_and_score_csv(output_path) + failed = [s for s in eval_statuses if s.get("status") == "failed"] + skipped = [s for s in eval_statuses if 
s.get("status") == "skipped"] + succeeded = [s for s in eval_statuses if s.get("status") == "success"] + + eval_result_summary = { + "directories_processed": total, + "success": len(succeeded), + "failed": len(failed), + "skipped": len(skipped), + "summary_path": csv_info.get("summary_path", output_path), + "summary_rows": csv_info.get("summary_rows", 0), + "score_rows": csv_info.get("score_rows", 0), + } + append_task_log(task_id, f"Eval complete: {len(succeeded)}/{total} succeeded") + else: + eval_result_summary = {"directories_processed": 0, "success": 0, "failed": 0, "skipped": 0} + append_task_log(task_id, "No eval result directories found") + else: + eval_result_summary = {} + + update_task_progress(task_id, message="Evaluation complete", pct=85) + summary["eval_summary"] = eval_result_summary + update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "evaluation": { + **eval_result_summary, + "enabled": bool(run_eval), + "recursive": bool(eval_recursive), + "overwrite": bool(eval_overwrite), + } + }, + ) + + # Step 5: Generate parquet + parquet_path = "" + if generate_parquet and pkl_archive_to_parquet: + on_progress("Step 5/5: Generating parquet...") + update_task_progress(task_id, message="Generating parquet...", pct=90) + + try: + parquet_path = pkl_archive_to_parquet( + output_path, + on_progress=_parquet_progress_callback( + task_id, + prefix="Parquet", + pct_start=90, + pct_end=99, + ), + on_skip=lambda path, reason: append_task_log( + task_id, + f"Parquet skipped {path}: {reason}", + ), + project_id=project_id, + job_id=job_id, + ) + update_task_progress(task_id, message="Parquet generated", pct=99) + append_task_log(task_id, f"Parquet generated: {parquet_path}") + except Exception as e: + append_task_log(task_id, f"Parquet generation failed: {e}") + parquet_path = "" + + update_task_progress(task_id, message="All steps complete", pct=100) + 
summary["parquet_path"] = parquet_path + + # Build final summary + update_task_result_summary(task_id, summary) + _update_run_metadata( + task_id, + parameters, + task_type="run_evaluator_and_process", + extra={ + "parquet": { + "enabled": bool(generate_parquet), + "path": parquet_path, + } + }, + ) + if evaluator_api.is_success_job_status(test_status): + append_task_log(task_id, "Workflow complete!") + else: + append_task_log(task_id, "Workflow complete. Evaluator job had failed test cases, but downloadable results were processed.") + _mark_run_status( + task_id, + parameters, + task_type="run_evaluator_and_process", + status="completed", + result_path=output_path, + ) + update_task_status(task_id, "completed", result_path=output_path) + except Exception as e: append_task_log(task_id, f"Failed: {e}") + _mark_run_status(task_id, parameters, task_type="run_evaluator_and_process", status="failed", error_message=str(e)) update_task_status(task_id, "failed", error_message=str(e)) raise @@ -277,6 +2548,9 @@ def job_download_scenarios(task_id: str, parameters: Dict[str, Any]) -> None: "build_parquet": job_build_parquet, "download_results": job_download_results, "download_scenarios": job_download_scenarios, + "download_and_eval": job_download_and_eval, + "run_release_specsheet_workflow": job_run_release_specsheet_workflow, + "run_evaluator_and_process": job_run_evaluator_and_process, }