diff --git a/reports/technical_risk_register.md b/reports/technical_risk_register.md index 0c5ba04..742dead 100644 --- a/reports/technical_risk_register.md +++ b/reports/technical_risk_register.md @@ -527,7 +527,9 @@ No silent corruption and no current breakage (development is green on 2.3.0) → `unfao/frames.py` (`to_prediction_frame` / `to_target_frame`) converts the repo's pandas tables into views-frames `PredictionFrame`/`TargetFrame` `(N, 1)` value objects to prove they satisfy the published views-frames contract. It is the **only** module importing `views_frames`, and **no live path calls it** — its sole consumer is the conformance test (the module's own docstring states "nothing in the live delivery path calls it yet"). It is forward-looking scaffolding for the C-40 frame migration: harmless, but a maintenance/confusion surface (a reader can mistake it for an active code path, and it hardcodes `S=1`, i.e. point-only, which will need revisiting for the draws/uncertainty work). No correctness or reliability impact → **Tier 4**. -See also C-40 (the pandas gate this adapter anticipates), #45 (the draws carrier that will need the `S>1` version). +**Partly addressed by S1 (epic #85 / #86, 2026-06-28).** `frames.py` was reworked to the declare-don't-guess design (D-11): it now exposes `build_prediction_frame` / `build_target_frame` over **declared primitives** (a 2-D `(N, S)` array + `(time, unit)`), supports **S>1** (the `S=1` hardcode is gone), is **pandas-free** (it no longer imports pandas — like `delivery/`, it is representation-free), and fails loud rather than inferring/reshaping. So the *S=1-hardcode*, *inference-risk*, and *pandas-coupling* dimensions are resolved. **Residual (entry stays open):** the module is still **not called by the live delivery path** — that wiring is S3 (#88, forecast convert-at-the-door). C-45 resolves when S3 lands (or, if S3 is abandoned, when the module is removed). 
+ +See also C-40 (the pandas gate this adapter anticipates), #45 (the draws carrier that will need the `S>1` version), epic **#85** / **#86** (the S1 rework) / **#88** (the S3 wiring that closes this), **D-11** (the declare-don't-guess decision). --- diff --git a/tests/test_views_frames_conformance.py b/tests/test_views_frames_conformance.py index 1ecf68f..783e329 100644 --- a/tests/test_views_frames_conformance.py +++ b/tests/test_views_frames_conformance.py @@ -1,96 +1,93 @@ -"""Conformance proof: this repo's data satisfies the views-frames contract. +"""Conformance + parity tests for the views-frames constructors (`unfao/frames.py`). -Phase 1 of the numpy/views-frames migration. Builds the repo's two kinds of -data as views-frames value objects via the ``unfao.frames`` adapters and runs -the published ``assert_frame_contract`` against each: +`build_prediction_frame` / `build_target_frame` take **declared primitives** — a 2-D +`(N, S)` value array + `(time, unit)` arrays — and build a views-frames value object. These +tests prove (1) the constructors carry the declared values faithfully (the parity oracle: +build primitives → construct → assert arrays equal), (2) the result satisfies the published +`assert_frame_contract`, and (3) the constructors **fail loud** rather than infer/reshape +when the declared shape is wrong. - forecasts (pred_ln_*) -> PredictionFrame (N, 1) - actuals (ged_*) -> TargetFrame (N, 1) - -This makes views-postprocessing the first live cross-repo consumer of the -frozen views-frames 1.0 contract. Offline and synthetic — no zarr, no Appwrite, -no network. ``assert_frame_contract`` checks the structural invariants -(float32, explicit trailing axis, integer identifiers of length N) and a -save/load round-trip; it raises on any violation. +No pandas: the pandas→primitives unpacking lives in the extraction seam (a separate story); +this module only exercises the primitives→frame boundary. 
""" import numpy as np -import pandas as pd import pytest from views_frames import PredictionFrame, TargetFrame from views_frames.conformance import assert_frame_contract -from views_postprocessing.unfao.frames import to_prediction_frame, to_target_frame - -_TIME_ID = "month_id" -_PG_ID = "priogrid_gid" - +from views_postprocessing.unfao.frames import build_prediction_frame, build_target_frame -def _frame(months=(100, 101), gids=(1, 2, 3)): - """Synthetic (month_id, priogrid_gid)-indexed frame with a pred and a ged column.""" - rows = [(m, g) for m in months for g in gids] - idx = pd.MultiIndex.from_tuples(rows, names=[_TIME_ID, _PG_ID]) - rng = np.random.default_rng(0) - return pd.DataFrame( - { - "pred_ln_sb_best": rng.gamma(2.0, 1.0, size=len(rows)), - "lr_ged_sb": rng.integers(0, 50, size=len(rows)), - }, - index=idx, - ) +# Declared identifiers for a small PGM block: 3 cells across 2 months (N = 6). +_TIME = np.array([100, 100, 100, 101, 101, 101], dtype=np.int64) +_UNIT = np.array([1, 2, 3, 1, 2, 3], dtype=np.int64) +_N = _TIME.shape[0] -class TestPredictionFrameConformance: - def test_satisfies_frame_contract(self): - pf = to_prediction_frame(_frame(), "pred_ln_sb_best") - assert_frame_contract(pf) # raises on any violation (incl. 
round-trip) - - def test_shape_and_sample_axis(self): - df = _frame() - pf = to_prediction_frame(df, "pred_ln_sb_best") +class TestPredictionFramePoint: + def test_point_is_single_sample(self): + values = np.arange(_N, dtype=np.float32).reshape(_N, 1) # declared (N, 1) + pf = build_prediction_frame(values, _TIME, _UNIT) assert isinstance(pf, PredictionFrame) - assert pf.n_rows == len(df) - assert pf.sample_count == 1 # scalar point predictions -> explicit S=1 + assert pf.sample_count == 1 assert pf.is_sample is False - assert pf.values.dtype == np.float32 - - def test_identifiers_match_our_index_flat_columns(self): - df = _frame().reset_index() # also exercise the reset_index() column path - pf = to_prediction_frame(df, "pred_ln_sb_best") - np.testing.assert_array_equal(pf.index.time, df[_TIME_ID].to_numpy()) - np.testing.assert_array_equal(pf.index.unit, df[_PG_ID].to_numpy()) - - def test_values_come_from_the_named_column(self): - # Guards against wiring the wrong column: values must equal the source. 
- df = _frame() - pf = to_prediction_frame(df, "pred_ln_sb_best") - np.testing.assert_array_equal( - pf.values[:, 0], df["pred_ln_sb_best"].to_numpy().astype(np.float32) - ) - - -class TestTargetFrameConformance: - def test_satisfies_frame_contract(self): - tf = to_target_frame(_frame(), "lr_ged_sb") - assert_frame_contract(tf) - - def test_observed_actuals_shape(self): - df = _frame() - tf = to_target_frame(df, "lr_ged_sb") + assert_frame_contract(pf) + + def test_point_values_carried_faithfully(self): + values = np.array([[0.0], [1.5], [2.5], [3.5], [4.5], [5.5]], dtype=np.float32) + pf = build_prediction_frame(values, _TIME, _UNIT) + np.testing.assert_array_equal(pf.values, values) + + +class TestPredictionFrameSamples: + def test_samples_carry_the_full_NxS_array(self): + S = 5 + rng = np.random.default_rng(0) + values = rng.gamma(2.0, size=(_N, S)).astype(np.float32) # declared (N, S) + pf = build_prediction_frame(values, _TIME, _UNIT) + assert pf.sample_count == S + assert pf.is_sample is True + np.testing.assert_array_equal(pf.values, values) # parity: no collapse, no reshape + assert_frame_contract(pf) + + def test_identifiers_match_declared_arrays(self): + values = np.zeros((_N, 3), dtype=np.float32) + pf = build_prediction_frame(values, _TIME, _UNIT) + np.testing.assert_array_equal(pf.index.time, _TIME) + np.testing.assert_array_equal(pf.index.unit, _UNIT) + + +class TestTargetFrame: + def test_target_is_single_valued(self): + values = np.arange(_N, dtype=np.float32).reshape(_N, 1) + tf = build_target_frame(values, _TIME, _UNIT) assert isinstance(tf, TargetFrame) - assert tf.n_rows == len(df) - assert tf.values.shape == (len(df), 1) assert tf.sample_count == 1 - assert tf.is_sample is False + np.testing.assert_array_equal(tf.values, values) + assert_frame_contract(tf) + + +class TestFailLoudNotInfer: + """The constructors declare-and-assert; they never reshape/stack/guess.""" + + def test_1d_values_raise_not_reshaped(self): + with 
pytest.raises(ValueError): + build_prediction_frame(np.arange(_N, dtype=np.float32), _TIME, _UNIT) + def test_object_dtype_cells_raise_not_stacked(self): + # an object-dtype column of per-cell arrays is NOT silently stacked into (N, S) + cells = np.empty(_N, dtype=object) + for i in range(_N): + cells[i] = np.array([float(i), float(i)], dtype=np.float32) + with pytest.raises((ValueError, TypeError)): + build_prediction_frame(cells, _TIME, _UNIT) -class TestAdapterGuards: - def test_missing_value_column_raises(self): - with pytest.raises(ValueError, match="not found"): - to_prediction_frame(_frame(), "does_not_exist") + def test_target_with_sample_axis_raises(self): + with pytest.raises(ValueError): + build_target_frame(np.zeros((_N, 2), dtype=np.float32), _TIME, _UNIT) - def test_missing_identifiers_raise(self): - df = pd.DataFrame({"pred_ln_sb_best": [1.0, 2.0]}) # no month_id/priogrid_gid - with pytest.raises(ValueError, match="month_id"): - to_prediction_frame(df, "pred_ln_sb_best") + def test_row_count_mismatch_raises(self): + values = np.zeros((_N + 1, 1), dtype=np.float32) + with pytest.raises(ValueError): + build_prediction_frame(values, _TIME, _UNIT) diff --git a/views_postprocessing/unfao/frames.py b/views_postprocessing/unfao/frames.py index b9a12f0..693e8c7 100644 --- a/views_postprocessing/unfao/frames.py +++ b/views_postprocessing/unfao/frames.py @@ -1,34 +1,28 @@ -"""views-frames adapters — phase 1 of the numpy/views-frames migration. - -Turn this repo's pandas prediction/target tables into ``views-frames`` value -objects so they can be checked against the published views-frames data contract -(``assert_frame_contract``). This is the **seam** through which frames enter -views-postprocessing; nothing in the live delivery path calls it yet (the -manager still moves pandas DataFrames — that is wired in a later phase). 
- -The repo carries two kinds of data, which map to two frame types: - - forecasts (``pred_ln_*``) -> PredictionFrame (N, 1) point preds - observed actuals (``ged_*``) -> TargetFrame (N, 1) ground truth - -Both sit at PGM level: ``time = month_id``, ``unit = priogrid_gid``. Our -forecasts are scalar point values, so the explicit trailing sample axis is -``S = 1`` (ADR-012); a ``TargetFrame`` is ``(N, 1)`` by definition. - -numpy-at-the-boundary: this module reads a few columns out of a pandas frame -and hands numpy arrays to views-frames. It performs no merges and adds no -geography — that stays in :mod:`enrichment` until a later migration phase. - -These adapters **convert**, they do not **validate**: a NaN value flows through -into the frame (the views-frames contract permits NaN — only object dtype is -banned). Null-guarding remains the manager's ``_validate`` gate, the single -fail-loud place a delivery is refused; an adapter is not that gate. +"""views-frames constructors — build views-frames value objects from **declared primitives**. + +This module is the boundary where this repo hands data to the views-frames foundation. It +takes explicit, declared primitives — a 2D ``(N, S)`` value array plus the ``(time, unit)`` +identifier arrays — and constructs the corresponding views-frames value object. + +**Declare, don't guess.** The caller states the shape: a point prediction is simply +``S = 1`` (an ``(N, 1)`` array); a sampled prediction is ``(N, S)``. This module does **not** +inspect dtypes to infer the sample axis and does **not** reshape a 1-D column or stack +object-dtype cells — a mismatched input fails loud (views-frames rejects anything that is +not an explicit 2-D ``(N, S)`` array). Producing these primitives *from* this repo's pandas +tables is the job of the extraction seam (the one pandas-aware place); keeping that knowledge +out of here leaves this module **representation-free**, like ``views_postprocessing/delivery/``. 
+ + forecasts -> build_prediction_frame(values (N, S), time, unit) # S >= 1 + observed actuals -> build_target_frame(values (N, 1), time, unit) # S == 1 + +Both sit at PGM level (``time = month_id``, ``unit = priogrid_gid``). These constructors +convert + validate the *shape* (via views-frames' contract); they do not null-guard — +that remains the manager's ``_validate`` gate. """ from __future__ import annotations import numpy as np -import pandas as pd from views_frames import ( FrameMetadata, @@ -38,37 +32,9 @@ TargetFrame, ) -_TIME_ID = "month_id" -_PG_ID = "priogrid_gid" - - -def _extract_ids( - df: pd.DataFrame, time_id_col: str, pg_id_col: str -) -> tuple[np.ndarray, np.ndarray]: - """Pull ``(time, unit)`` arrays from columns *or* a MultiIndex. - Accepts both shapes the repo produces: the manager's index-keyed frame - (``MultiIndex[(month_id, priogrid_gid)]``) and a ``reset_index()`` flat frame. - """ - if time_id_col in df.columns and pg_id_col in df.columns: - return df[time_id_col].to_numpy(), df[pg_id_col].to_numpy() - names = list(df.index.names) - if time_id_col in names and pg_id_col in names: - return ( - df.index.get_level_values(time_id_col).to_numpy(), - df.index.get_level_values(pg_id_col).to_numpy(), - ) - raise ValueError( - f"DataFrame must carry '{time_id_col}' and '{pg_id_col}' as columns or " - f"index levels; got columns={list(df.columns)}, index names={names}" - ) - - -def _pgm_index( - df: pd.DataFrame, time_id_col: str, pg_id_col: str -) -> SpatioTemporalIndex: - """A PGM ``SpatioTemporalIndex`` from this frame's ``(month_id, priogrid_gid)``.""" - time, unit = _extract_ids(df, time_id_col, pg_id_col) +def _pgm_index(time, unit) -> SpatioTemporalIndex: + """A PGM ``SpatioTemporalIndex`` from declared ``(time, unit)`` arrays (int64).""" return SpatioTemporalIndex( time=np.asarray(time, dtype=np.int64), unit=np.asarray(unit, dtype=np.int64), @@ -76,40 +42,25 @@ def _pgm_index( ) -def _column_2d(df: pd.DataFrame, value_col: str) -> 
np.ndarray: - """One value column as an ``(N, 1)`` array (the explicit trailing axis).""" - if value_col not in df.columns: - raise ValueError( - f"value column '{value_col}' not found; columns={list(df.columns)}" - ) - return df[value_col].to_numpy().reshape(-1, 1) - - -def to_prediction_frame( - df: pd.DataFrame, - value_col: str, - *, - time_id_col: str = _TIME_ID, - pg_id_col: str = _PG_ID, - metadata: FrameMetadata | None = None, +def build_prediction_frame( + values, time, unit, *, metadata: FrameMetadata | None = None ) -> PredictionFrame: - """Wrap one forecast column as a ``PredictionFrame`` ``(N, 1)`` at PGM level. + """Build a PGM ``PredictionFrame`` from a declared ``(N, S)`` array + ``(time, unit)``. - The values coerce to float32 inside views-frames; scalar point predictions - become a single-sample axis (``sample_count == 1``). + ``values`` must be a 2-D array with one row per ``(time, unit)`` and an explicit trailing + sample axis ``S`` (a point prediction declares ``S = 1``). A 1-D array, an object-dtype + array of per-cell sample arrays, or a row-count that disagrees with the index all **raise** + — the caller must declare the ``(N, S)`` shape, this constructor will not infer it. """ - index = _pgm_index(df, time_id_col, pg_id_col) - return PredictionFrame(_column_2d(df, value_col), index, metadata) + return PredictionFrame(np.asarray(values, dtype=np.float32), _pgm_index(time, unit), metadata) -def to_target_frame( - df: pd.DataFrame, - value_col: str, - *, - time_id_col: str = _TIME_ID, - pg_id_col: str = _PG_ID, - metadata: FrameMetadata | None = None, +def build_target_frame( + values, time, unit, *, metadata: FrameMetadata | None = None ) -> TargetFrame: - """Wrap one observed-actuals column (e.g. ``ged_sb_best``) as a ``TargetFrame``.""" - index = _pgm_index(df, time_id_col, pg_id_col) - return TargetFrame(_column_2d(df, value_col), index, metadata) + """Build a PGM ``TargetFrame`` from a declared ``(N, 1)`` array + ``(time, unit)``. 
+ + Ground truth is single-valued: ``values`` must be ``(N, 1)`` (``S == 1``); any other shape + raises (views-frames' ``TargetFrame`` contract). + """ + return TargetFrame(np.asarray(values, dtype=np.float32), _pgm_index(time, unit), metadata)