Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion reports/technical_risk_register.md
Original file line number Diff line number Diff line change
Expand Up @@ -527,7 +527,9 @@ No silent corruption and no current breakage (development is green on 2.3.0) →

`unfao/frames.py` (`to_prediction_frame` / `to_target_frame`) converts the repo's pandas tables into views-frames `PredictionFrame`/`TargetFrame` `(N, 1)` value objects to prove they satisfy the published views-frames contract. It is the **only** module importing `views_frames`, and **no live path calls it** — its sole consumer is the conformance test (the module's own docstring states "nothing in the live delivery path calls it yet"). It is forward-looking scaffolding for the C-40 frame migration: harmless, but a maintenance/confusion surface (a reader can mistake it for an active code path, and it hardcodes `S=1`, i.e. point-only, which will need revisiting for the draws/uncertainty work). No correctness or reliability impact → **Tier 4**.

See also C-40 (the pandas gate this adapter anticipates), #45 (the draws carrier that will need the `S>1` version).
**Partly addressed by S1 (epic #85 / #86, 2026-06-28).** `frames.py` was reworked to the declare-don't-guess design (D-11): it now exposes `build_prediction_frame` / `build_target_frame` over **declared primitives** (a 2-D `(N, S)` array + `(time, unit)`), supports **S>1** (the `S=1` hardcode is gone), is **pandas-free** (no longer the lone pandas importer — it sits alongside `delivery/`), and fails loud rather than inferring/reshaping. So the *S=1-hardcode*, *inference-risk*, and *pandas-coupling* dimensions are resolved. **Residual (entry stays open):** the module is still **not called by the live delivery path** — that wiring is S3 (#88, forecast convert-at-the-door). C-45 resolves when S3 lands (or, if S3 is abandoned, when the module is removed).

See also C-40 (the pandas gate this adapter anticipates), #45 (the draws carrier that will need the `S>1` version), epic **#85** / **#86** (the S1 rework) / **#88** (the S3 wiring that closes this), **D-11** (the declare-don't-guess decision).

---

Expand Down
149 changes: 73 additions & 76 deletions tests/test_views_frames_conformance.py
Original file line number Diff line number Diff line change
@@ -1,96 +1,93 @@
"""Conformance proof: this repo's data satisfies the views-frames contract.
"""Conformance + parity tests for the views-frames constructors (`unfao/frames.py`).

Phase 1 of the numpy/views-frames migration. Builds the repo's two kinds of
data as views-frames value objects via the ``unfao.frames`` adapters and runs
the published ``assert_frame_contract`` against each:
`build_prediction_frame` / `build_target_frame` take **declared primitives** — a 2-D
`(N, S)` value array + `(time, unit)` arrays — and build a views-frames value object. These
tests prove (1) the constructors carry the declared values faithfully (the parity oracle:
build primitives → construct → assert arrays equal), (2) the result satisfies the published
`assert_frame_contract`, and (3) the constructors **fail loud** rather than infer/reshape
when the declared shape is wrong.

forecasts (pred_ln_*) -> PredictionFrame (N, 1)
actuals (ged_*) -> TargetFrame (N, 1)

This makes views-postprocessing the first live cross-repo consumer of the
frozen views-frames 1.0 contract. Offline and synthetic — no zarr, no Appwrite,
no network. ``assert_frame_contract`` checks the structural invariants
(float32, explicit trailing axis, integer identifiers of length N) and a
save/load round-trip; it raises on any violation.
No pandas: the pandas→primitives unpacking lives in the extraction seam (a separate story);
this module only exercises the primitives→frame boundary.
"""

import numpy as np
import pandas as pd
import pytest

from views_frames import PredictionFrame, TargetFrame
from views_frames.conformance import assert_frame_contract

from views_postprocessing.unfao.frames import to_prediction_frame, to_target_frame

_TIME_ID = "month_id"
_PG_ID = "priogrid_gid"

from views_postprocessing.unfao.frames import build_prediction_frame, build_target_frame

def _frame(months=(100, 101), gids=(1, 2, 3)):
    """Build a small synthetic table keyed by ``(month_id, priogrid_gid)``.

    One row is produced for every ``(month, gid)`` pair, months varying slowest.
    The table has two columns: ``pred_ln_sb_best`` (gamma draws) and
    ``lr_ged_sb`` (integers in ``[0, 50)``), both drawn from a generator seeded
    with 0 so the fixture is reproducible.
    """
    pairs = [(month, gid) for month in months for gid in gids]
    n_rows = len(pairs)
    multi_index = pd.MultiIndex.from_tuples(pairs, names=[_TIME_ID, _PG_ID])
    generator = np.random.default_rng(0)
    # Draw order is part of the fixture: gamma first, then the integers.
    predictions = generator.gamma(2.0, 1.0, size=n_rows)
    observed = generator.integers(0, 50, size=n_rows)
    columns = {"pred_ln_sb_best": predictions, "lr_ged_sb": observed}
    return pd.DataFrame(columns, index=multi_index)
# Declared identifiers for a small PGM block: 3 cells across 2 months (N = 6).
_TIME = np.array([100, 100, 100, 101, 101, 101], dtype=np.int64)
_UNIT = np.array([1, 2, 3, 1, 2, 3], dtype=np.int64)
_N = _TIME.shape[0]


class TestPredictionFrameConformance:
def test_satisfies_frame_contract(self):
pf = to_prediction_frame(_frame(), "pred_ln_sb_best")
assert_frame_contract(pf) # raises on any violation (incl. round-trip)

def test_shape_and_sample_axis(self):
df = _frame()
pf = to_prediction_frame(df, "pred_ln_sb_best")
class TestPredictionFramePoint:
def test_point_is_single_sample(self):
values = np.arange(_N, dtype=np.float32).reshape(_N, 1) # declared (N, 1)
pf = build_prediction_frame(values, _TIME, _UNIT)
assert isinstance(pf, PredictionFrame)
assert pf.n_rows == len(df)
assert pf.sample_count == 1 # scalar point predictions -> explicit S=1
assert pf.sample_count == 1
assert pf.is_sample is False
assert pf.values.dtype == np.float32

def test_identifiers_match_our_index_flat_columns(self):
df = _frame().reset_index() # also exercise the reset_index() column path
pf = to_prediction_frame(df, "pred_ln_sb_best")
np.testing.assert_array_equal(pf.index.time, df[_TIME_ID].to_numpy())
np.testing.assert_array_equal(pf.index.unit, df[_PG_ID].to_numpy())

def test_values_come_from_the_named_column(self):
# Guards against wiring the wrong column: values must equal the source.
df = _frame()
pf = to_prediction_frame(df, "pred_ln_sb_best")
np.testing.assert_array_equal(
pf.values[:, 0], df["pred_ln_sb_best"].to_numpy().astype(np.float32)
)


class TestTargetFrameConformance:
def test_satisfies_frame_contract(self):
tf = to_target_frame(_frame(), "lr_ged_sb")
assert_frame_contract(tf)

def test_observed_actuals_shape(self):
df = _frame()
tf = to_target_frame(df, "lr_ged_sb")
assert_frame_contract(pf)

def test_point_values_carried_faithfully(self):
values = np.array([[0.0], [1.5], [2.5], [3.5], [4.5], [5.5]], dtype=np.float32)
pf = build_prediction_frame(values, _TIME, _UNIT)
np.testing.assert_array_equal(pf.values, values)


class TestPredictionFrameSamples:
    """Sampled predictions: a declared ``(N, S)`` array with ``S > 1``."""

    def test_samples_carry_the_full_NxS_array(self):
        """The declared ``(N, 5)`` array is stored as-is and passes the frame contract."""
        n_samples = 5
        draws = np.random.default_rng(0).gamma(2.0, size=(_N, n_samples))
        declared = draws.astype(np.float32)  # declared (N, S)
        frame = build_prediction_frame(declared, _TIME, _UNIT)
        assert frame.sample_count == n_samples
        assert frame.is_sample is True
        # Parity: stored values equal the declared array (no collapse, no reshape).
        np.testing.assert_array_equal(frame.values, declared)
        assert_frame_contract(frame)

    def test_identifiers_match_declared_arrays(self):
        """The frame's index exposes the declared ``time`` and ``unit`` arrays."""
        declared = np.zeros((_N, 3), dtype=np.float32)
        frame = build_prediction_frame(declared, _TIME, _UNIT)
        np.testing.assert_array_equal(frame.index.time, _TIME)
        np.testing.assert_array_equal(frame.index.unit, _UNIT)


class TestTargetFrame:
def test_target_is_single_valued(self):
values = np.arange(_N, dtype=np.float32).reshape(_N, 1)
tf = build_target_frame(values, _TIME, _UNIT)
assert isinstance(tf, TargetFrame)
assert tf.n_rows == len(df)
assert tf.values.shape == (len(df), 1)
assert tf.sample_count == 1
assert tf.is_sample is False
np.testing.assert_array_equal(tf.values, values)
assert_frame_contract(tf)


class TestFailLoudNotInfer:
"""The constructors declare-and-assert; they never reshape/stack/guess."""

def test_1d_values_raise_not_reshaped(self):
with pytest.raises(ValueError):
build_prediction_frame(np.arange(_N, dtype=np.float32), _TIME, _UNIT)

def test_object_dtype_cells_raise_not_stacked(self):
# an object-dtype column of per-cell arrays is NOT silently stacked into (N, S)
cells = np.empty(_N, dtype=object)
for i in range(_N):
cells[i] = np.array([float(i), float(i)], dtype=np.float32)
with pytest.raises((ValueError, TypeError)):
build_prediction_frame(cells, _TIME, _UNIT)

class TestAdapterGuards:
def test_missing_value_column_raises(self):
with pytest.raises(ValueError, match="not found"):
to_prediction_frame(_frame(), "does_not_exist")
def test_target_with_sample_axis_raises(self):
with pytest.raises(ValueError):
build_target_frame(np.zeros((_N, 2), dtype=np.float32), _TIME, _UNIT)

def test_missing_identifiers_raise(self):
df = pd.DataFrame({"pred_ln_sb_best": [1.0, 2.0]}) # no month_id/priogrid_gid
with pytest.raises(ValueError, match="month_id"):
to_prediction_frame(df, "pred_ln_sb_best")
def test_row_count_mismatch_raises(self):
values = np.zeros((_N + 1, 1), dtype=np.float32)
with pytest.raises(ValueError):
build_prediction_frame(values, _TIME, _UNIT)
125 changes: 38 additions & 87 deletions views_postprocessing/unfao/frames.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,28 @@
"""views-frames adapters — phase 1 of the numpy/views-frames migration.

Turn this repo's pandas prediction/target tables into ``views-frames`` value
objects so they can be checked against the published views-frames data contract
(``assert_frame_contract``). This is the **seam** through which frames enter
views-postprocessing; nothing in the live delivery path calls it yet (the
manager still moves pandas DataFrames — that is wired in a later phase).

The repo carries two kinds of data, which map to two frame types:

forecasts (``pred_ln_*``) -> PredictionFrame (N, 1) point preds
observed actuals (``ged_*``) -> TargetFrame (N, 1) ground truth

Both sit at PGM level: ``time = month_id``, ``unit = priogrid_gid``. Our
forecasts are scalar point values, so the explicit trailing sample axis is
``S = 1`` (ADR-012); a ``TargetFrame`` is ``(N, 1)`` by definition.

numpy-at-the-boundary: this module reads a few columns out of a pandas frame
and hands numpy arrays to views-frames. It performs no merges and adds no
geography — that stays in :mod:`enrichment` until a later migration phase.

These adapters **convert**, they do not **validate**: a NaN value flows through
into the frame (the views-frames contract permits NaN — only object dtype is
banned). Null-guarding remains the manager's ``_validate`` gate, the single
fail-loud place a delivery is refused; an adapter is not that gate.
"""views-frames constructors — build views-frames value objects from **declared primitives**.

This module is the boundary where this repo hands data to the views-frames foundation. It
takes explicit, declared primitives — a 2-D ``(N, S)`` value array plus the ``(time, unit)``
identifier arrays — and constructs the corresponding views-frames value object.

**Declare, don't guess.** The caller states the shape: a point prediction is simply
``S = 1`` (an ``(N, 1)`` array); a sampled prediction is ``(N, S)``. This module does **not**
inspect dtypes to infer the sample axis and does **not** reshape a 1-D column or stack
object-dtype cells — a mismatched input fails loud (views-frames rejects anything that is
not an explicit 2-D ``(N, S)`` array). Producing these primitives *from* this repo's pandas
tables is the job of the extraction seam (the one pandas-aware place); keeping that knowledge
out of here leaves this module **representation-free**, like ``views_postprocessing/delivery/``.

forecasts -> build_prediction_frame(values (N, S), time, unit) # S >= 1
observed actuals -> build_target_frame(values (N, 1), time, unit) # S == 1

Both sit at PGM level (``time = month_id``, ``unit = priogrid_gid``). These constructors
convert + validate the *shape* (via views-frames' contract); they do not null-guard —
that remains the manager's ``_validate`` gate.
"""

from __future__ import annotations

import numpy as np
import pandas as pd

from views_frames import (
FrameMetadata,
Expand All @@ -38,78 +32,35 @@
TargetFrame,
)

_TIME_ID = "month_id"
_PG_ID = "priogrid_gid"


def _extract_ids(
df: pd.DataFrame, time_id_col: str, pg_id_col: str
) -> tuple[np.ndarray, np.ndarray]:
"""Pull ``(time, unit)`` arrays from columns *or* a MultiIndex.

Accepts both shapes the repo produces: the manager's index-keyed frame
(``MultiIndex[(month_id, priogrid_gid)]``) and a ``reset_index()`` flat frame.
"""
if time_id_col in df.columns and pg_id_col in df.columns:
return df[time_id_col].to_numpy(), df[pg_id_col].to_numpy()
names = list(df.index.names)
if time_id_col in names and pg_id_col in names:
return (
df.index.get_level_values(time_id_col).to_numpy(),
df.index.get_level_values(pg_id_col).to_numpy(),
)
raise ValueError(
f"DataFrame must carry '{time_id_col}' and '{pg_id_col}' as columns or "
f"index levels; got columns={list(df.columns)}, index names={names}"
)


def _pgm_index(
df: pd.DataFrame, time_id_col: str, pg_id_col: str
) -> SpatioTemporalIndex:
"""A PGM ``SpatioTemporalIndex`` from this frame's ``(month_id, priogrid_gid)``."""
time, unit = _extract_ids(df, time_id_col, pg_id_col)
def _pgm_index(time, unit) -> SpatioTemporalIndex:
    """Build the PGM-level ``SpatioTemporalIndex`` for declared ``(time, unit)`` arrays.

    Both identifier arrays are converted to ``int64`` numpy arrays before being
    handed to ``SpatioTemporalIndex`` with ``level=SpatialLevel.PGM``.
    """
    time_ids = np.asarray(time, dtype=np.int64)
    unit_ids = np.asarray(unit, dtype=np.int64)
    return SpatioTemporalIndex(time=time_ids, unit=unit_ids, level=SpatialLevel.PGM)


def _column_2d(df: pd.DataFrame, value_col: str) -> np.ndarray:
"""One value column as an ``(N, 1)`` array (the explicit trailing axis)."""
if value_col not in df.columns:
raise ValueError(
f"value column '{value_col}' not found; columns={list(df.columns)}"
)
return df[value_col].to_numpy().reshape(-1, 1)


def to_prediction_frame(
df: pd.DataFrame,
value_col: str,
*,
time_id_col: str = _TIME_ID,
pg_id_col: str = _PG_ID,
metadata: FrameMetadata | None = None,
def build_prediction_frame(
values, time, unit, *, metadata: FrameMetadata | None = None
) -> PredictionFrame:
"""Wrap one forecast column as a ``PredictionFrame`` ``(N, 1)`` at PGM level.
"""Build a PGM ``PredictionFrame`` from a declared ``(N, S)`` array + ``(time, unit)``.

The values coerce to float32 inside views-frames; scalar point predictions
become a single-sample axis (``sample_count == 1``).
``values`` must be a 2-D array with one row per ``(time, unit)`` and an explicit trailing
sample axis ``S`` (a point prediction declares ``S = 1``). A 1-D array, an object-dtype
array of per-cell sample arrays, or a row-count that disagrees with the index all **raise**
— the caller must declare the ``(N, S)`` shape, this constructor will not infer it.
"""
index = _pgm_index(df, time_id_col, pg_id_col)
return PredictionFrame(_column_2d(df, value_col), index, metadata)
return PredictionFrame(np.asarray(values, dtype=np.float32), _pgm_index(time, unit), metadata)


def to_target_frame(
df: pd.DataFrame,
value_col: str,
*,
time_id_col: str = _TIME_ID,
pg_id_col: str = _PG_ID,
metadata: FrameMetadata | None = None,
def build_target_frame(
values, time, unit, *, metadata: FrameMetadata | None = None
) -> TargetFrame:
"""Wrap one observed-actuals column (e.g. ``ged_sb_best``) as a ``TargetFrame``."""
index = _pgm_index(df, time_id_col, pg_id_col)
return TargetFrame(_column_2d(df, value_col), index, metadata)
"""Build a PGM ``TargetFrame`` from a declared ``(N, 1)`` array + ``(time, unit)``.

Ground truth is single-valued: ``values`` must be ``(N, 1)`` (``S == 1``); any other shape
raises (views-frames' ``TargetFrame`` contract).
"""
return TargetFrame(np.asarray(values, dtype=np.float32), _pgm_index(time, unit), metadata)
Loading