From 07e77df719b04aeea31a483cb2711b4ff10ac2a3 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 01:43:49 +0200 Subject: [PATCH 01/29] docs: add eval-scenarios-table-integration design (eng + design reviewed) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phased plan to productionize the EtlPocScenarios PoC into the real EvalRunDetails scenarios table: thin store, schema columns, self-hydrating cells, ETL filtering, comparison, live updates; retire the PoC. Eng review: 5 outside-voice gaps folded in (non-terminal rendering unbuilt, useEtlColumns drops "other" columns, perf premise unmeasured → perf gate, T5 comparison is a build, CSV export path missed). Design review: focused on interaction states + filter UX (5/10 → 9/10). --- .../eval-scenarios-table-integration.md | 285 ++++++++++++++++++ 1 file changed, 285 insertions(+) create mode 100644 docs/designs/eval-scenarios-table-integration.md diff --git a/docs/designs/eval-scenarios-table-integration.md b/docs/designs/eval-scenarios-table-integration.md new file mode 100644 index 0000000000..4e3774f433 --- /dev/null +++ b/docs/designs/eval-scenarios-table-integration.md @@ -0,0 +1,285 @@ +# Eval Scenarios Table — ETL Integration + +**Created:** 2026-05-22 +**Status:** RFC — Eng-reviewed; ready to implement (Phase 1) +**Related:** [eval-etl-engine](./eval-etl-engine.md), [etl-engine](./etl-engine.md), [eval-filtering](./eval-filtering.md) +**Authors:** Arda + +--- + +## Summary + +The `EtlPocScenarios` PoC (the `/etl-poc` page) showed an evaluation run +scenarios table can be fast with a specific strategy: thin rows (identity +only), page-level bulk hydration into molecule caches, self-resolving cells, +and ETL-engine-backed predicate filtering. + +This doc covers folding that strategy into the **real** eval run scenarios +table (`EvalRunDetails/Table.tsx`) and retiring the PoC. It is **phased** — a +core production table never big-bangs. 
+ +> **Eng review reframe.** The first draft called the data-layer swap a +> "low-risk mechanical port." That was wrong. The PoC only ever ran against +> *finished* runs — it fabricates `scenario: {status: "success"}` +> (`EtlResolvedCell.tsx:135`, `index.tsx:692`). Rendering pending / running / +> failed scenarios and a real "skeleton while pending" policy is **unbuilt +> design work**, now scoped into Phase 1. See [Resolved decisions](#resolved-decisions). + +--- + +## Current state — two implementations of one table + +| | Production (`EvalRunDetails/Table.tsx`) | PoC (`EtlPocScenarios`) | +|---|---|---| +| Store | `evaluationPreviewTableStore` — semi-full rows | `scenarioThinPaginatedStore` — `{key, id, scenarioId}` | +| Columns | backend metadata (`usePreviewColumns`) | run graph (`useEtlColumns` → `resolveMappings`) | +| Cell data | per-visible-cell fetch (`useScenarioCellValue`) | `EtlResolvedCell` from molecule caches; `useHydrateScenarios` bulk-fills per page | +| Filtering | none | predicate bar + ETL viewport-fill loop | +| Comparison | interleaved rows, 2-4 runs | single run | +| Live runs | 5s run-status poll; 15-30s `staleTime`; human-only metrics gap-fill | none — assumes terminal data | +| Fetch path | `fetchEvaluationScenarioWindow` | **same** `fetchEvaluationScenarioWindow` | + +`scenarioPaginatedStore.ts`'s own header states the intent: *"replace +`evaluationPreviewTableStore` with this once the scenarios view is on the +molecule-cache pattern."* This is that project. + +--- + +## Resolved decisions + +| # | Decision | +|---|----------| +| **Hydration shape** | Thin rows + self-hydrating cells. The thin row carries identity + `testcaseId` (comparison join key) + `status` (live-update + skeleton) — never column data. | +| **Column source** | Run graph (`data.steps` + `data.mappings`) via `resolveMappings`. **Correction:** `useEtlColumns` currently drops `group.kind === "other"` columns (`useEtlColumns.tsx:56`, "skip in the test page"). 
Production must keep them — that shortcut is removed in T2. | +| **Cell caching** | Same molecules over the same TanStack layer; no net regression *expected* — validated, not asserted, by the Phase 1 perf gate. | +| **D1 — phasing** | Phase the migration. Phase 1: data-layer swap. Phase 2: filtering. Phase 3: comparison + live + co-consumers. Then retire the PoC. Each phase reviewable and revertable; the table works between phases. | +| **D2 — comparison display** | Interleaved rows (today's model), not testcase-aligned columns. Compare-mode column set = shared testcase inputs + the **common-evaluator intersection** across compared runs + the standard invocation output. Reuses single-run column derivation. | +| **D3 — live updates** | Match production's modest bar: run-status poll + page invalidation while non-terminal + human-eval metrics gap-fill. No real scenario streaming. | +| **D5 — perf gate** | After Phase 1, benchmark the new table vs the current `useScenarioCellValue` table on a 1000+ scenario run with comparison on. A regression stops Phase 2. | + +**Still open — closes before Phase 2:** + +- **Filter composition** — single predicate (PoC today) vs multi-condition + AND/OR. `eval-filtering.md` specs the fuller version. Decide at Phase 2 start. + +--- + +## Architecture — target + +The production table adopts the PoC strategy in place. Four layers (per +`eval-etl-engine.md`): + +``` +EvalRunDetails table (OSS UI) + └── thin scenarios store ──> ETL filter pipeline ──> rendered viewport + (identity + join keys) (runLoop + filterTransform) (InfiniteVirtualTable) + │ + cells self-resolve from ────────┘ + molecule caches (results / metrics / testcases / traces) + bulk-hydrated per page; cell-materialized on demand +``` + +- **Source** — the thin scenarios store; reuses `fetchEvaluationScenarioWindow`. +- **Transform** — the eval-specific `filterTransform`. 
"Skeleton while + pending" for rows whose slices are not yet hydrated **or whose scenario has + not yet run** — these two cases are distinct and both must be handled. +- **Sink** — the rendered viewport; the loop runs until the viewport fills. +- **Cells** — `EtlResolvedCell`, resolving from molecule caches. + +--- + +## Phase 1 — data-layer swap + +The table's internals, table-only. The focus drawer and `SingleScenarioViewer` +stay on `useScenarioCellValue` (kept alive) until Phase 3. + +- **T1 — thin scenarios store.** Promote a thin `createInfiniteTableStore` + (identity + `testcaseId` + `status`) to the production store. Reuse + `fetchEvaluationScenarioWindow`. **Per-eval-type window order** (online + `descending`, auto/human `ascending`) — production already does this; + carry it over (the PoC's hardcoded `ascending` is a PoC gap, not a target). +- **T2 — schema columns.** Wire `useEtlColumns` / `resolveMappings` into + `Table.tsx`; retire `usePreviewColumns` + `tableColumnsAtomFamily`. **Remove + the "other"-column drop** so the visible column set matches today. +- **T3 — self-hydrating cells + non-terminal rendering.** `EtlResolvedCell` + + `useHydrateScenarios` + `useCellMaterialization` for the table's cells. This + is **not** purely mechanical: add real rendering for pending / running / + failed / partial scenarios, and a "skeleton while pending" policy that + distinguishes *slice-not-hydrated* from *scenario-not-run*. The PoC's + `status: "success"` fabrication is removed. + +**Perf gate (D5)** — after T1-T3 land: benchmark the new table against the +current one on a 1000+ scenario run, comparison on. Regression → stop, rethink. + +--- + +## Phase 2 — filtering + +- **T4 — filtering.** Decide filter composition first (see open decisions). + `filterSchema` derives filterable fields: columns → evaluator steps → + evaluator output schemas → typed fields + type-matched operators + (`eval-filtering.md` D4). 
The `filterTransform` evaluates the predicate per + row against hydrated metrics; the loop runs until the viewport fills. + **Reuse `withRateLimitRetry`** for the scan — a low-hit-ratio filter scans + many scenario + metric pages and EE throttling will 429 it (the batch-add + lesson). + +--- + +## Phase 3 — comparison, live updates, co-consumers + +- **T5 — comparison.** A **build, not a port** — the PoC has zero multi-run + code. Per compared run: a second store scope, a **schema fetch** (needed for + the common-evaluator intersection), and per-run hydration of result slices + (`testcase_id` lives on results). Then align compared runs to the *filtered* + main rows by `testcase_id` (the `mergedRows` logic exists in production — + port it over the thin/cache model). The **CSV export path** + (`Table.tsx:542`) rebuilds the merge logic and migrates here too. +- **T6 — live updates.** Run-status poll + non-terminal page invalidation + + human-eval metrics gap-fill. +- **T8 — co-consumer migration.** Migrate the focus drawer (`focusDrawerAtom`, + `FocusDrawerHeader`, `FocusDrawerSidePanel`, `FocusDrawer`) and + `SingleScenarioViewerPOC` off `useScenarioCellValue` + `evaluationPreviewTableStore`, + then delete `useScenarioCellValue`. + +--- + +## Retire the PoC + +- **T7** — delete `EtlPocScenarios/` and the `/etl-poc` routes (oss + ee). + Gated on Phase 3 parity verified. + +--- + +## Design — interaction states & filter UX + +From the design review (focused — the migration preserves the table's visual +design; these are the genuinely new design surfaces). + +**Cell states:** +- *Skeleton (not hydrated)* — reuse the PoC's `EtlSkeletonCell`: a fixed-height + placeholder bar, identical row height to a populated row, so there is no + layout jump when data lands. +- *Non-terminal scenarios (running / failed / pending)* — match production's + existing live-table rendering. 
Hard rule: a *running* cell must read as + in-progress, visually distinct from a missing value — never a bare "—" for a + running scenario (the user must be able to tell "computed nothing" from + "still computing"). + +**Filter states:** +- *Scanning* — a live hit-ratio counter ("Scanned N / matched M", from + `hitRatioAtom`), not a silent spinner. A picky filter scans thousands of + scenarios to surface a few; the counter explains the wait and keeps trust. +- *No match* — a real empty state: "No scenarios match this filter" + a + one-click **Clear filter** action. Not "No items found." +- *Rate-limited / scan failed* — keep the partial viewport visible with a + non-blocking inline indicator ("Filtering paused — retrying…"). Never a + blocking overlay. + +**Filter bar** — lives in the eval run details header row, following the +observability `Filters` placement. Single vs multi-predicate composition is the +open Phase 2 decision; if multi-predicate, reuse the observability filter UI. + +## Test plan + +``` +PLANNED COMPONENT TESTS +T1 thin store [GAP] page fetch → skeleton+merge; per-eval-type order +T2 schema columns [GAP][CRITICAL][REGRESSION] resolveMappings column set + == usePreviewColumns for auto/human/online, + "other" columns INCLUDED — before deleting the old path +T3 cells [GAP] resolve from caches; pending/running/failed render + [GAP] skeleton-while-pending: not-hydrated vs not-run +T4 filtering [GAP] filterSchema typed fields; filterTransform + match/no-match/pending; [→E2E] filter → rows +T5 comparison [GAP] compare-run schema fetch; testcase_id join; + common-evaluator intersection; [→E2E] compare+filter +T6 live updates [GAP] poll stops at terminal; page invalidation; gap-fill +T8 co-consumers [GAP][REGRESSION] focus drawer + scenario viewer render + after the cell swap +``` + +Pure logic — `filterTransform`, `filterSchema` derivation, the comparison +testcase-join — unit-tests in `@agenta/entities` vitest (the batch-add +harness). 
The two `[→E2E]` flows go to Playwright. The T2 and T8 **regression +guards are mandatory** — they protect a user-visible column set and two live +co-consumers. + +--- + +## Edge cases & constraints + +- **Non-terminal scenarios** — pending / running / failed / partial rows must + render; the PoC never did. Distinguish slice-not-hydrated from + scenario-not-run. +- **"Other" columns** — `useEtlColumns` must keep them (T2). +- **Eval types** — auto / human / online all derive columns from the run + schema; online fetches `descending`. +- **Filtered + comparison** — filtering the main run re-drives compare + alignment; a filtered-out main row drops its compare group. +- **Filter scan throttling** — reuse `withRateLimitRetry` (T4). +- **Large-run memory** — within a run, bulk-hydrate fills caches; + `useScopeChangeEviction` only evicts on run change. Verify at 10k scenarios; + per-chunk eviction exists if needed. +- **Testset mismatch** — compared runs not sharing the main testset produce an + empty testcase join (the candidate filter already guards this). +- **`EvalRunDetails` / `EvalRunDetails2` split** — do not deepen it; touched + code consolidates into `EvalRunDetails/`. + +--- + +## NOT in scope + +- **Testcase-aligned comparison columns** — interleaved rows for now (D2). +- **Real scenario streaming** — match production's poll bar (D3). +- **Cross-run filter predicates** ("main high, run B regressed") — filter the + main run only; cross-run is a v2 feature. +- **Consolidating `EvalRunDetails2`** — only the touched code moves. +- **Backend filter param** — client-side filter is v1 (`eval-filtering.md`). + +## What already exists (reused, not rebuilt) + +- The ETL engine + generic primitives (`@agenta/entities/etl`) — built and + tested this session. +- `fetchEvaluationScenarioWindow` — the scenario fetch; reused unchanged. +- `mergedRows` testcase_id-join alignment — ported, not reinvented. +- `withRateLimitRetry` — reused for the filter scan. 
+- The PoC's `useHydrateScenarios` / `useEtlColumns` / `EtlResolvedCell` / + `useCellMaterialization` — ported (with the corrections above). + +--- + +## Implementation tasks + +**Phase 1 — data-layer swap** +- [ ] **T1 (P1, human: ~1d / CC: ~2h)** — thin scenarios store; per-eval-type window order. +- [ ] **T2 (P1, human: ~1d / CC: ~2h)** — schema columns; keep "other" columns; **column-parity regression test** before deleting `usePreviewColumns`. +- [ ] **T3 (P1, human: ~3d / CC: ~half-day)** — self-hydrating cells **plus non-terminal scenario rendering + skeleton-while-pending** (the unbuilt part). +- [ ] **Perf gate (P1)** — benchmark vs the old table, 1000+ scenarios, comparison on. + +**Phase 2 — filtering** +- [ ] **T4 (P1, human: ~3d / CC: ~half-day)** — `filterSchema` + `filterTransform` + predicate UI + viewport-fill loop; reuse `withRateLimitRetry`. Close the composition decision first. + +**Phase 3 — comparison, live, co-consumers** +- [ ] **T5 (P1, human: ~3d / CC: ~half-day)** — comparison build: compare-run schema fetch + per-run hydration + testcase_id join + export-path migration. +- [ ] **T6 (P2, human: ~1d / CC: ~2h)** — live updates: poll + page invalidation + human gap-fill. +- [ ] **T8 (P1, human: ~1d / CC: ~2h)** — migrate the focus drawer + `SingleScenarioViewerPOC` off `useScenarioCellValue`, then delete `useScenarioCellValue`. + +**Cleanup** +- [ ] **T7 (P2, human: ~1h / CC: ~10min)** — delete `EtlPocScenarios/` + `/etl-poc` routes once Phase 3 parity is verified. 
+ +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | `/plan-ceo-review` | Scope & strategy | 0 | — | — | +| Codex Review | `/codex review` | Independent 2nd opinion | 0 | — | — | +| Eng Review | `/plan-eng-review` | Architecture & tests (required) | 1 | clean | 5 decisions resolved, 0 critical gaps open | +| Design Review | `/plan-design-review` | UI/UX gaps | 1 | clean | 5/10 → 9/10, 2 decisions, focused on states | +| DX Review | `/plan-devex-review` | Developer experience gaps | 0 | — | — | + +- **OUTSIDE VOICE (eng):** Claude subagent — caught 5 real gaps the section review under-weighted: T1-T3 mislabeled as "low-risk mechanical port" (non-terminal rendering is unbuilt), `useEtlColumns` drops "other" columns (guaranteed regression), the perf premise was asserted not measured (→ D5 perf gate), T5 comparison is a build not a port (+ unlisted compare-schema fetch), the CSV export path was missed. All folded into the plan. +- **ENG DECISIONS:** D1 phase the migration · D2 interleaved rows + common-evaluator intersection columns · D3 match production's live bar · D4 outside voice ran · D5 perf-validation gate after Phase 1. +- **DESIGN DECISIONS:** focused review (migration preserves the visual design) · live hit-ratio counter for filter scanning · interaction-state specs added (skeleton, non-terminal cells, filter no-match empty state, rate-limited indicator). +- **UNRESOLVED:** 1 — filter composition (single vs multi-predicate) + its UI, intentionally deferred to Phase 2 start. Phase 1 has no open decisions. +- **VERDICT:** ENG + DESIGN REVIEW CLEARED — ready to implement Phase 1. 
From d899b31e29b48d386c342d503139dcf4c5420059 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 01:50:46 +0200 Subject: [PATCH 02/29] =?UTF-8?q?docs:=20drop=20T1=20from=20eval-scenarios?= =?UTF-8?q?-table=20plan=20=E2=80=94=20store=20already=20thin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reading evaluationPreviewTableStore.ts confirmed it is already a thin store: PreviewTableRow carries only identity + testcaseId + status + scenarioIndex + comparison fields (no column data), and already does per-eval-type window order. T1 ("promote a thin store") was lateral churn — re-implementing an existing store. Phase 1 collapses to the coupled T2+T3 column+cell swap against the existing store. Confirms the eng-review outside voice's finding. --- .../eval-scenarios-table-integration.md | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/docs/designs/eval-scenarios-table-integration.md b/docs/designs/eval-scenarios-table-integration.md index 4e3774f433..4d850f1265 100644 --- a/docs/designs/eval-scenarios-table-integration.md +++ b/docs/designs/eval-scenarios-table-integration.md @@ -88,27 +88,37 @@ EvalRunDetails table (OSS UI) --- -## Phase 1 — data-layer swap +## Phase 1 — column + cell swap The table's internals, table-only. The focus drawer and `SingleScenarioViewer` stay on `useScenarioCellValue` (kept alive) until Phase 3. -- **T1 — thin scenarios store.** Promote a thin `createInfiniteTableStore` - (identity + `testcaseId` + `status`) to the production store. Reuse - `fetchEvaluationScenarioWindow`. **Per-eval-type window order** (online - `descending`, auto/human `ascending`) — production already does this; - carry it over (the PoC's hardcoded `ascending` is a PoC gap, not a target). 
+> **Implementation-time finding — T1 dropped.** Reading +> `evaluationPreviewTableStore.ts` confirmed it is *already* a thin store: +> `PreviewTableRow` carries only identity + `testcaseId` + `status` + +> `scenarioIndex` + comparison fields — zero column data — and it already does +> per-eval-type window order (line 114). The PoC's separate +> `scenarioThinPaginatedStore` exists only to drop a couple of cheap unused +> fields. **The store stays as-is — there is no T1.** The eng-review outside +> voice flagged this; a direct read confirmed it. + +Phase 1 is the **column + cell swap** in `Table.tsx`. T2 and T3 are **coupled** +— a column definition carries its own cell `render` function, so the column +source and the cell renderer swap together in one change. + - **T2 — schema columns.** Wire `useEtlColumns` / `resolveMappings` into `Table.tsx`; retire `usePreviewColumns` + `tableColumnsAtomFamily`. **Remove - the "other"-column drop** so the visible column set matches today. + the "other"-column drop** (`useEtlColumns.tsx:56`) so the visible column set + matches today. - **T3 — self-hydrating cells + non-terminal rendering.** `EtlResolvedCell` + - `useHydrateScenarios` + `useCellMaterialization` for the table's cells. This - is **not** purely mechanical: add real rendering for pending / running / - failed / partial scenarios, and a "skeleton while pending" policy that - distinguishes *slice-not-hydrated* from *scenario-not-run*. The PoC's - `status: "success"` fabrication is removed. - -**Perf gate (D5)** — after T1-T3 land: benchmark the new table against the + `useHydrateScenarios` + `useCellMaterialization`, against the existing + `evaluationPreviewTableStore` rows (keyed by `scenarioId`). **Not** purely + mechanical: add real rendering for pending / running / failed / partial + scenarios, and a "skeleton while pending" policy that distinguishes + *slice-not-hydrated* from *scenario-not-run*. The PoC's `status: "success"` + fabrication is removed. 
+ +**Perf gate (D5)** — after T2+T3 land: benchmark the new table against the current one on a 1000+ scenario run, comparison on. Regression → stop, rethink. --- @@ -241,6 +251,8 @@ co-consumers. - The ETL engine + generic primitives (`@agenta/entities/etl`) — built and tested this session. +- `evaluationPreviewTableStore` — already a thin store (identity + `status`, + no column data, per-eval-type order); kept as-is, no swap needed (T1 dropped). - `fetchEvaluationScenarioWindow` — the scenario fetch; reused unchanged. - `mergedRows` testcase_id-join alignment — ported, not reinvented. - `withRateLimitRetry` — reused for the filter scan. @@ -251,10 +263,9 @@ co-consumers. ## Implementation tasks -**Phase 1 — data-layer swap** -- [ ] **T1 (P1, human: ~1d / CC: ~2h)** — thin scenarios store; per-eval-type window order. -- [ ] **T2 (P1, human: ~1d / CC: ~2h)** — schema columns; keep "other" columns; **column-parity regression test** before deleting `usePreviewColumns`. -- [ ] **T3 (P1, human: ~3d / CC: ~half-day)** — self-hydrating cells **plus non-terminal scenario rendering + skeleton-while-pending** (the unbuilt part). +**Phase 1 — column + cell swap** (T1 dropped — `evaluationPreviewTableStore` is already thin; see Phase 1) +- [ ] **T2 (P1, human: ~1d / CC: ~2h)** — schema columns; keep "other" columns; **column-parity regression test** before deleting `usePreviewColumns`. Lands together with T3 (coupled). +- [ ] **T3 (P1, human: ~3d / CC: ~half-day)** — self-hydrating cells **plus non-terminal scenario rendering + skeleton-while-pending** (the unbuilt part). Lands together with T2. - [ ] **Perf gate (P1)** — benchmark vs the old table, 1000+ scenarios, comparison on. **Phase 2 — filtering** @@ -281,5 +292,6 @@ co-consumers. 
- **OUTSIDE VOICE (eng):** Claude subagent — caught 5 real gaps the section review under-weighted: T1-T3 mislabeled as "low-risk mechanical port" (non-terminal rendering is unbuilt), `useEtlColumns` drops "other" columns (guaranteed regression), the perf premise was asserted not measured (→ D5 perf gate), T5 comparison is a build not a port (+ unlisted compare-schema fetch), the CSV export path was missed. All folded into the plan. - **ENG DECISIONS:** D1 phase the migration · D2 interleaved rows + common-evaluator intersection columns · D3 match production's live bar · D4 outside voice ran · D5 perf-validation gate after Phase 1. - **DESIGN DECISIONS:** focused review (migration preserves the visual design) · live hit-ratio counter for filter scanning · interaction-state specs added (skeleton, non-terminal cells, filter no-match empty state, rate-limited indicator). +- **D6 (implementation-time finding):** starting Phase 1 confirmed `evaluationPreviewTableStore` is already a thin store (identity + status, no column data, per-eval-type order). **T1 is dropped** — Phase 1 is the coupled T2+T3 column+cell swap against the existing store. Confirms the eng-review outside voice's "T1 re-implements an existing store" point. - **UNRESOLVED:** 1 — filter composition (single vs multi-predicate) + its UI, intentionally deferred to Phase 2 start. Phase 1 has no open decisions. -- **VERDICT:** ENG + DESIGN REVIEW CLEARED — ready to implement Phase 1. +- **VERDICT:** ENG + DESIGN REVIEW CLEARED — ready to implement Phase 1 (T2+T3). From 0cbb07f9913d86ee364c0d776a97159546db6146 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 02:02:24 +0200 Subject: [PATCH 03/29] docs: record T2 export-coupling finding from Table.tsx read Reading Table.tsx showed the CSV export path (exportResolveValue, columnLookupMap, loadAllPagesBeforeExport) is keyed off columnResult column ids, which differ from useEtlColumns keys. 
Phase 1 swaps display columns only; usePreviewColumns/columnResult stay alive for export until Phase 3 (T5). The "other"-column un-drop ripples into ColumnLeaf, EtlResolvedCell, and useCellMaterialization. --- .../designs/eval-scenarios-table-integration.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/docs/designs/eval-scenarios-table-integration.md b/docs/designs/eval-scenarios-table-integration.md index 4d850f1265..8d769872c2 100644 --- a/docs/designs/eval-scenarios-table-integration.md +++ b/docs/designs/eval-scenarios-table-integration.md @@ -106,10 +106,16 @@ Phase 1 is the **column + cell swap** in `Table.tsx`. T2 and T3 are **coupled** — a column definition carries its own cell `render` function, so the column source and the cell renderer swap together in one change. -- **T2 — schema columns.** Wire `useEtlColumns` / `resolveMappings` into - `Table.tsx`; retire `usePreviewColumns` + `tableColumnsAtomFamily`. **Remove - the "other"-column drop** (`useEtlColumns.tsx:56`) so the visible column set - matches today. +- **T2 — schema columns (display only).** Wire `useEtlColumns` / + `resolveMappings` into `Table.tsx` for the **rendered** columns. **Remove the + "other"-column drop** (`useEtlColumns.tsx:56`) so the visible set matches + today — note this ripples: `ColumnLeaf["kind"]`, `EtlResolvedCell`'s + `columnKind`, and `useCellMaterialization`'s slice map all need an "other" + case. `usePreviewColumns` / `usePreviewTableData` / `columnResult` **stay + alive** — the CSV export (`exportResolveValue`, `columnLookupMap`) is keyed + off `columnResult` ids, which differ from `useEtlColumns` keys. Full + retirement of the old column path moves to Phase 3 with the export + migration (T5). Two column systems coexist transitionally (accepted under D1). 
- **T3 — self-hydrating cells + non-terminal rendering.** `EtlResolvedCell` + `useHydrateScenarios` + `useCellMaterialization`, against the existing `evaluationPreviewTableStore` rows (keyed by `scenarioId`). **Not** purely @@ -264,7 +270,7 @@ co-consumers. ## Implementation tasks **Phase 1 — column + cell swap** (T1 dropped — `evaluationPreviewTableStore` is already thin; see Phase 1) -- [ ] **T2 (P1, human: ~1d / CC: ~2h)** — schema columns; keep "other" columns; **column-parity regression test** before deleting `usePreviewColumns`. Lands together with T3 (coupled). +- [ ] **T2 (P1, human: ~1d / CC: ~2h)** — schema columns for the **rendered** table; keep "other" columns (ripples into `ColumnLeaf`/`EtlResolvedCell`/`useCellMaterialization`); **column-parity regression test**. Keep `usePreviewColumns`/`columnResult` alive for the export path — full retirement is Phase 3 / T5. Lands together with T3 (coupled). - [ ] **T3 (P1, human: ~3d / CC: ~half-day)** — self-hydrating cells **plus non-terminal scenario rendering + skeleton-while-pending** (the unbuilt part). Lands together with T2. - [ ] **Perf gate (P1)** — benchmark vs the old table, 1000+ scenarios, comparison on. @@ -293,5 +299,6 @@ co-consumers. - **ENG DECISIONS:** D1 phase the migration · D2 interleaved rows + common-evaluator intersection columns · D3 match production's live bar · D4 outside voice ran · D5 perf-validation gate after Phase 1. - **DESIGN DECISIONS:** focused review (migration preserves the visual design) · live hit-ratio counter for filter scanning · interaction-state specs added (skeleton, non-terminal cells, filter no-match empty state, rate-limited indicator). - **D6 (implementation-time finding):** starting Phase 1 confirmed `evaluationPreviewTableStore` is already a thin store (identity + status, no column data, per-eval-type order). **T1 is dropped** — Phase 1 is the coupled T2+T3 column+cell swap against the existing store. 
Confirms the eng-review outside voice's "T1 re-implements an existing store" point. +- **D7 (implementation-time finding):** reading `Table.tsx` showed the CSV export path (`exportResolveValue`, `columnLookupMap`, `loadAllPagesBeforeExport`) is keyed off `columnResult` column ids, which differ from `useEtlColumns` keys. **Phase 1 swaps display columns only** and keeps `usePreviewColumns`/`columnResult` alive for export; the old column path fully retires in Phase 3 with the export migration (T5). The "other"-column un-drop ripples into `ColumnLeaf`, `EtlResolvedCell`, and `useCellMaterialization`. - **UNRESOLVED:** 1 — filter composition (single vs multi-predicate) + its UI, intentionally deferred to Phase 2 start. Phase 1 has no open decisions. - **VERDICT:** ENG + DESIGN REVIEW CLEARED — ready to implement Phase 1 (T2+T3). From aac4b3622536178b776fd393c77f1081e4a0b37f Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 02:45:18 +0200 Subject: [PATCH 04/29] feat(oss): swap eval scenarios table to ETL run-graph columns + cells Phase 1 (T2+T3) of eval-scenarios-table-integration. The eval run scenarios table now derives its schema columns from the run graph and renders cells that self-hydrate from molecule caches, replacing the backend-metadata + per-cell-fetch display path. 
- groupRunColumns: pure mapping-grouping in @agenta/entities etl; keeps "other"-kind columns (the PoC dropped them) - EvalRunDetails/etl/: useEtlColumns, useHydrateScenarios, run-aware useCellMaterialization, useScopeChangeEviction, EtlColumnHeader, EtlResolvedCell - EtlResolvedCell renders pending/running/failed scenarios distinctly and distinguishes slice-not-hydrated from scenario-not-run - Table.tsx swaps only the display columns; usePreviewColumns / columnResult stay alive for the CSV export path (Phase 3) - column-parity regression test for groupRunColumns --- .../src/components/EvalRunDetails/Table.tsx | 214 ++++++---- .../EvalRunDetails/etl/EtlColumnHeader.tsx | 74 ++++ .../etl/cellMaterializerContext.ts | 16 + .../etl/cells/EtlResolvedCell.tsx | 377 ++++++++++++++++++ .../etl/useCellMaterialization.ts | 268 +++++++++++++ .../EvalRunDetails/etl/useEtlColumns.tsx | 107 +++++ .../EvalRunDetails/etl/useHydrateScenarios.ts | 262 ++++++++++++ .../etl/useScopeChangeEviction.ts | 52 +++ .../etl/__tests__/groupRunColumns.test.ts | 179 +++++++++ .../src/evaluationRun/etl/index.ts | 3 + .../src/evaluationRun/etl/resolveMappings.ts | 75 ++++ 11 files changed, 1559 insertions(+), 68 deletions(-) create mode 100644 web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx create mode 100644 web/oss/src/components/EvalRunDetails/etl/cellMaterializerContext.ts create mode 100644 web/oss/src/components/EvalRunDetails/etl/cells/EtlResolvedCell.tsx create mode 100644 web/oss/src/components/EvalRunDetails/etl/useCellMaterialization.ts create mode 100644 web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx create mode 100644 web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts create mode 100644 web/oss/src/components/EvalRunDetails/etl/useScopeChangeEviction.ts create mode 100644 web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx 
b/web/oss/src/components/EvalRunDetails/Table.tsx index bc5d6560e4..1cd8f70e1b 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -1,5 +1,6 @@ import {useCallback, useMemo, useRef} from "react" +import type {RunSchema} from "@agenta/entities/evaluationRun/etl" import {message} from "@agenta/ui/app-message" import clsx from "clsx" import {useAtomValue, useStore} from "jotai" @@ -19,11 +20,17 @@ import { import useComparisonPaginations from "../EvalRunDetails2/hooks/useComparisonPaginations" import {MAX_COMPARISON_RUNS, compareRunIdsAtom, getComparisonColor} from "./atoms/compare" +import {effectiveProjectIdAtom} from "./atoms/run" import {runDisplayNameAtomFamily} from "./atoms/runDerived" import type {EvaluationTableColumn} from "./atoms/table" -import {DEFAULT_SCENARIO_PAGE_SIZE} from "./atoms/table" +import {DEFAULT_SCENARIO_PAGE_SIZE, evaluationRunQueryAtomFamily} from "./atoms/table" import type {PreviewTableRow} from "./atoms/tableRows" import ScenarioColumnVisibilityPopoverContent from "./components/columnVisibility/ColumnVisibilityPopoverContent" +import {CellMaterializerContext} from "./etl/cellMaterializerContext" +import {useCellMaterialization} from "./etl/useCellMaterialization" +import {useEtlColumns} from "./etl/useEtlColumns" +import {useHydrateScenarios} from "./etl/useHydrateScenarios" +import {useScopeChangeEviction} from "./etl/useScopeChangeEviction" import { evaluationPreviewDatasetStore, evaluationPreviewTableStore, @@ -87,6 +94,75 @@ const EvalRunDetailsTable = ({ const previewColumns = usePreviewColumns({columnResult, evaluationType}) + // ── ETL schema columns + self-hydrating cells (Phase 1 — T2 + T3) ── + // The schema columns (testset / application / evaluator / metrics / + // other) are derived from the run graph and rendered by cells that + // resolve from molecule caches. 
`usePreviewColumns` / `columnResult` + // stay alive above for the CSV export path (keyed off `columnResult` + // ids) — only the *display* columns swap here. + const effectiveProjectId = useAtomValue(effectiveProjectIdAtom) + const projectId = _projectId ?? effectiveProjectId + + const runQuery = useAtomValue(useMemo(() => evaluationRunQueryAtomFamily(runId), [runId])) + const runSchema = useMemo(() => { + const data = runQuery.data?.rawRun?.data + const steps = data?.steps + const mappings = data?.mappings + if (!Array.isArray(steps) || !Array.isArray(mappings)) return null + return {steps, mappings} + }, [runQuery.data]) + + const etlColumns = useEtlColumns({projectId, runId, schema: runSchema}) + + // Page-level hydrate — predicate-aware (Phase 2). In Phase 1 (no + // predicate) this is inert; cells materialize their own visible data. + useHydrateScenarios({ + projectId, + runId, + rows: basePagination.rows, + schema: runSchema, + sliceMode: "auto", + }) + + // Cell-side lazy materializer — coalesces visible cells' slice + // requests into one bulk fetch per (slice, run). + const cellMaterializer = useCellMaterialization({projectId, runId}) + + // Evict molecule caches written for the outgoing run on scope change. + useScopeChangeEviction({projectId, runId}) + + // Final rendered column set: production meta columns (index / status, + // timestamp, action) and the column-visibility trigger are kept; the + // schema group columns are replaced by the ETL-derived ones. While the + // run schema is still loading, the production columns are used whole + // (their skeleton groups cover the gap). 
+ const tableColumns = useMemo(() => { + const src = previewColumns.columns + if (!runSchema || etlColumns.length === 0) return src + const out: typeof src = [] + let inserted = false + for (const col of src) { + const children = (col as {children?: unknown[]}).children + const isGroup = Array.isArray(children) && children.length > 0 + if (isGroup) { + if (!inserted) { + out.push(...(etlColumns as typeof src)) + inserted = true + } + // drop the production schema group column + } else { + out.push(col) + } + } + if (!inserted) { + // No production group columns — insert ETL groups before the + // trailing column-visibility trigger. + const at = Math.max(out.length - 1, 0) + out.splice(at, 0, ...(etlColumns as typeof src)) + } + return out + }, [previewColumns.columns, etlColumns, runSchema]) + // Inject synthetic columns for comparison exports (do not render in UI) const exportColumns = useMemo(() => { const hasCompareRuns = compareSlots.some(Boolean) @@ -828,74 +904,76 @@ const EvalRunDetailsTable = ({ const hasCompareRuns = compareSlots.some(Boolean) return ( -
-
- - datasetStore={evaluationPreviewDatasetStore} - tableScope={tableScope} - store={store} - columns={previewColumns.columns} - rowKey={(record) => record.key} - tableClassName={clsx( - "agenta-scenario-table", - `agenta-scenario-table--row-${rowHeight}`, - )} - resizableColumns - useSettingsDropdown - settingsDropdownMenuItems={rowHeightMenuItems} - columnVisibilityMenuRenderer={( - controls, - close, - {scopeId, onExport, isExporting}, - ) => ( - - )} - pagination={paginationForShell} - exportOptions={exportOptions} - tableProps={{ - rowClassName: (record) => - clsx("scenario-row", { - "scenario-row--comparison": record.isComparisonRow, - }), - size: "small", - sticky: true, - virtual: true, - bordered: true, - tableLayout: "fixed", - onRow: (record) => { - const backgroundColor = hasCompareRuns - ? getComparisonColor( - typeof record.compareIndex === "number" - ? record.compareIndex - : 0, - ) - : "#fff" - - return { - onClick: (event) => { - const target = event.target as HTMLElement | null - if (target?.closest("[data-ivt-stop-row-click]")) return - handleRowClick(record as TableRowData) - }, - className: clsx({ - "comparison-row": record.isComparisonRow, + +
+
+ + datasetStore={evaluationPreviewDatasetStore} + tableScope={tableScope} + store={store} + columns={tableColumns} + rowKey={(record) => record.key} + tableClassName={clsx( + "agenta-scenario-table", + `agenta-scenario-table--row-${rowHeight}`, + )} + resizableColumns + useSettingsDropdown + settingsDropdownMenuItems={rowHeightMenuItems} + columnVisibilityMenuRenderer={( + controls, + close, + {scopeId, onExport, isExporting}, + ) => ( + + )} + pagination={paginationForShell} + exportOptions={exportOptions} + tableProps={{ + rowClassName: (record) => + clsx("scenario-row", { + "scenario-row--comparison": record.isComparisonRow, }), - style: backgroundColor ? {backgroundColor} : undefined, - } - }, - }} - /> -
- -
+ size: "small", + sticky: true, + virtual: true, + bordered: true, + tableLayout: "fixed", + onRow: (record) => { + const backgroundColor = hasCompareRuns + ? getComparisonColor( + typeof record.compareIndex === "number" + ? record.compareIndex + : 0, + ) + : "#fff" + + return { + onClick: (event) => { + const target = event.target as HTMLElement | null + if (target?.closest("[data-ivt-stop-row-click]")) return + handleRowClick(record as TableRowData) + }, + className: clsx({ + "comparison-row": record.isComparisonRow, + }), + style: backgroundColor ? {backgroundColor} : undefined, + } + }, + }} + /> +
+ +
+ ) } diff --git a/web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx b/web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx new file mode 100644 index 0000000000..d69faa29e0 --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx @@ -0,0 +1,74 @@ +/** + * EtlColumnHeader + * + * Renders the nested-header label for a column group. The default + * `computeColumnGroup` resolver falls back to `Testset ` / + * `Application ` because it doesn't fetch the entity itself. + * + * This header is that override — same pattern production's + * `StepGroupHeader` uses: subscribe to the entity reference atom by ID + * and surface the entity's name when available, fall back to the slug + * otherwise. Evaluator + metrics + other groups already carry + * `slugToTitle`-rendered labels, so no entity lookup is needed. + */ + +import {useMemo} from "react" + +import type {ColumnGroup} from "@agenta/entities/evaluationRun/etl" +import {Tooltip} from "antd" +import {atom, useAtomValue} from "jotai" + +import { + applicationReferenceQueryAtomFamily, + testsetReferenceQueryAtomFamily, +} from "../atoms/references" + +const emptyAtom = atom<{data: {name?: string; slug?: string} | null} | null>(null) + +interface EtlColumnHeaderProps { + group: ColumnGroup +} + +const pickName = (entity: unknown): string | null => { + if (!entity || typeof entity !== "object") return null + const name = (entity as {name?: unknown}).name + return typeof name === "string" && name.length > 0 ? name : null +} + +const EtlColumnHeader = ({group}: EtlColumnHeaderProps) => { + const refAtom = useMemo(() => { + if (group.kind === "testset") { + const id = (group.refs?.testset as {id?: string} | undefined)?.id + return id ? testsetReferenceQueryAtomFamily(id) : emptyAtom + } + if (group.kind === "application") { + const id = (group.refs?.application as {id?: string} | undefined)?.id + return id ? 
applicationReferenceQueryAtomFamily(id) : emptyAtom + } + return emptyAtom + }, [group]) + + const ref = useAtomValue(refAtom) as {data?: unknown} | null + const name = pickName(ref?.data ?? null) + + const label = useMemo(() => { + switch (group.kind) { + case "testset": + return name ? `Testset ${name}` : group.label + case "application": + return name ? `Application ${name}` : group.label + default: + return group.label + } + }, [group.kind, group.label, name]) + + return ( + + + {label} + + + ) +} + +export default EtlColumnHeader diff --git a/web/oss/src/components/EvalRunDetails/etl/cellMaterializerContext.ts b/web/oss/src/components/EvalRunDetails/etl/cellMaterializerContext.ts new file mode 100644 index 0000000000..fa2c3fd900 --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/cellMaterializerContext.ts @@ -0,0 +1,16 @@ +/** + * One-line context shared between the table page (provider) and the cells + * (consumers). Cells call `materializer.request(slice, req)` when their + * column's data is missing from cache; the materializer coalesces + * concurrent same-tick requests into one bulk fetch per slice. + * + * Kept in its own file to avoid a circular import between + * `EtlResolvedCell` and the table (the cell imports the context type, + * the page sets the context value). + */ + +import {createContext} from "react" + +import type {CellMaterializer} from "./useCellMaterialization" + +export const CellMaterializerContext = createContext(null) diff --git a/web/oss/src/components/EvalRunDetails/etl/cells/EtlResolvedCell.tsx b/web/oss/src/components/EvalRunDetails/etl/cells/EtlResolvedCell.tsx new file mode 100644 index 0000000000..d023c89162 --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/cells/EtlResolvedCell.tsx @@ -0,0 +1,377 @@ +/** + * EtlResolvedCell — a single cell that resolves its value from molecule caches. + * + * Each cell: + * 1. 
Subscribes to TanStack cache entries for its scenario via `useQuery` + * with `enabled: false` — no network triggered from a cell render. + * The hydrate / materializer paths populate those entries. + * 2. Assembles a HydratedScenarioRow from the four entity slices + * (results / metrics / testcase / traces). + * 3. Runs `resolveMappings` against the hydrated row + run schema and + * picks out *just this cell's* column value. + * + * Non-terminal rendering — the load-bearing difference from the PoC. The + * PoC fabricated `scenario.status = "success"` because it only ran + * against finished runs. Production scenarios can be pending / running / + * failed / partial, so the cell renders four distinct states: + * + * value — resolved, render it. + * running — scenario not terminal: an in-progress indicator. NEVER a + * bare "—" (the user must tell "still computing" apart from + * "computed nothing"). + * loading — scenario terminal but this cell's slices not hydrated yet. + * missing — scenario terminal, slices hydrated, genuinely no value: "—". 
+ */ + +import {useContext, useEffect, useMemo} from "react" + +import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun" +import { + resolveMappings, + unwrapStatsForCompare, + type RunSchema, + type ResolvedColumn, + type ColumnGroup, + type HydratedScenarioRow, + type HydratableScenario, +} from "@agenta/entities/evaluationRun/etl" +import {useQuery, useQueryClient} from "@tanstack/react-query" +import {Tag} from "antd" +import clsx from "clsx" +import {useAtomValue} from "jotai" + +import {isTerminalStatus} from "../../atoms/compare" +import {scenarioRowHeightAtom, type ScenarioRowHeight} from "../../state/rowHeight" +import {CellMaterializerContext} from "../cellMaterializerContext" +import {hydrationVersionAtom} from "../useHydrateScenarios" + +type ColumnKind = ColumnGroup["kind"] + +const MAX_LINES_BY_HEIGHT: Record = { + small: 4, + medium: 9, + large: 18, +} + +/** Entity slices each column kind reads from. */ +const SLICES_BY_KIND: Record = { + testset: ["results", "testcases"], + application: ["results", "traces"], + evaluator: ["results", "metrics"], + metrics: ["metrics"], + other: ["results"], +} + +export interface EtlResolvedCellProps { + projectId: string + runId: string + scenarioId: string + /** Real scenario status — drives the running / loading / missing split. */ + scenarioStatus: string + /** Column the cell should render — group kind + slug + column name. */ + columnKind: ColumnKind + columnGroupSlug: string | null + columnName: string + /** Run schema (steps + mappings). */ + schema: RunSchema | null +} + +const EtlResolvedCell = ({ + projectId, + runId, + scenarioId, + scenarioStatus, + columnKind, + columnGroupSlug, + columnName, + schema, +}: EtlResolvedCellProps) => { + const queryClient = useQueryClient() + const materializer = useContext(CellMaterializerContext) + // Bumped after each hydrate / materialize batch so cells re-render and + // pick up late-arriving testcase / trace cache writes. 
+ const hydrationVersion = useAtomValue(hydrationVersionAtom) + const rowHeight = useAtomValue(scenarioRowHeightAtom) + const maxLines = MAX_LINES_BY_HEIGHT[rowHeight] + + // Pure subscriptions — `enabled: false` + no-op queryFn means a cell + // render never triggers network. The hydrate / materializer paths are + // the only writers; cells just observe. + const resultsQ = useQuery({ + queryKey: ["evaluation-results", projectId, runId, scenarioId], + queryFn: () => null, + enabled: false, + staleTime: Infinity, + }) + const metricsQ = useQuery({ + queryKey: ["evaluation-metrics", projectId, runId, scenarioId], + queryFn: () => null, + enabled: false, + staleTime: Infinity, + }) + + const resultsFetched = resultsQ.data !== undefined + const metricsFetched = metricsQ.data !== undefined + + // Resolve this cell's column from the molecule caches. + const resolved = useMemo(() => { + if (!schema) return null + + const results = (resultsQ.data ?? + evaluationResultMolecule.get.byScenario({projectId, runId, scenarioId}) ?? + []) as HydratedScenarioRow["results"] + const metrics = (metricsQ.data ?? + evaluationMetricMolecule.get.byScenario({projectId, runId, scenarioId}) ?? + []) as HydratedScenarioRow["metrics"] + + const testcaseIdCandidates = results + .map((r) => r.testcase_id) + .filter((v): v is string => typeof v === "string" && v.length > 0) + const testcaseId = testcaseIdCandidates[0] ?? null + const testcase = testcaseId + ? (queryClient.getQueryData([ + "testcase", + projectId, + testcaseId, + ]) ?? 
null) + : null + + const traces: Record = {} + for (const r of results) { + if (typeof r.trace_id === "string" && r.trace_id) { + const cached = queryClient.getQueryData([ + "trace-entity", + projectId, + r.trace_id, + ]) + if (cached != null) traces[r.trace_id] = cached + } + } + + const hydrated: HydratedScenarioRow = { + scenario: {id: scenarioId, status: scenarioStatus} as HydratableScenario, + results, + metrics, + testcase, + traces, + } + + const cols = resolveMappings(hydrated, { + steps: schema.steps, + mappings: schema.mappings, + }) + + return ( + cols.find((c) => { + if (c.name !== columnName) return false + if (c.group.kind !== columnKind) return false + if (columnGroupSlug != null && c.group.slug !== columnGroupSlug) return false + return true + }) ?? null + ) + }, [ + projectId, + runId, + scenarioId, + scenarioStatus, + columnKind, + columnGroupSlug, + columnName, + schema, + resultsQ.data, + metricsQ.data, + hydrationVersion, + queryClient, + ]) + + // Cell-side lazy materialization. Ask the page-level materializer to + // fill cache slices this cell needs; the materializer coalesces + // concurrent same-tick requests into one bulk fetch per (slice, run). + useEffect(() => { + if (!materializer || !projectId || !runId || !scenarioId) return + for (const slice of SLICES_BY_KIND[columnKind]) { + if (slice === "results") { + if (!evaluationResultMolecule.get.byScenario({projectId, runId, scenarioId})) { + materializer.request("results", {scenarioId, runId}) + } + } else if (slice === "metrics") { + if (!evaluationMetricMolecule.get.byScenario({projectId, runId, scenarioId})) { + materializer.request("metrics", {scenarioId, runId}) + } + } else if (slice === "testcases") { + const cachedResults = evaluationResultMolecule.get.byScenario({ + projectId, + runId, + scenarioId, + }) + const testcaseId = + cachedResults?.find((r) => typeof r.testcase_id === "string" && r.testcase_id) + ?.testcase_id ?? 
null + if (testcaseId) { + const cached = queryClient.getQueryData(["testcase", projectId, testcaseId]) + if (cached == null) materializer.request("testcases", {testcaseId}) + } + } else if (slice === "traces") { + const cachedResults = evaluationResultMolecule.get.byScenario({ + projectId, + runId, + scenarioId, + }) + const traceId = + cachedResults?.find((r) => typeof r.trace_id === "string" && r.trace_id) + ?.trace_id ?? null + if (traceId) { + const cached = queryClient.getQueryData(["trace-entity", projectId, traceId]) + if (cached == null) materializer.request("traces", {traceId}) + } + } + } + }, [materializer, projectId, runId, scenarioId, columnKind, hydrationVersion, queryClient]) + + const hasValue = !!resolved && resolved.source !== "missing" + + // Is a slice this cell needs still in flight? Distinguishes + // "slice-not-hydrated" (skeleton) from "genuinely missing" ("—") for + // a terminal scenario. A slice that the materializer marked failed is + // NOT counted as loading — otherwise a permanently rate-limited fetch + // would leave the cell on an infinite skeleton. + const sliceStillLoading = useMemo(() => { + for (const slice of SLICES_BY_KIND[columnKind]) { + if (slice === "results") { + if (!resultsFetched && !materializer?.hasFailed("results", {scenarioId, runId})) { + return true + } + } else if (slice === "metrics") { + if (!metricsFetched && !materializer?.hasFailed("metrics", {scenarioId, runId})) { + return true + } + } else if (slice === "testcases") { + // Needs results first — covered by the results check above. + if (!resultsFetched) continue + const cachedResults = evaluationResultMolecule.get.byScenario({ + projectId, + runId, + scenarioId, + }) + const testcaseId = + cachedResults?.find((r) => typeof r.testcase_id === "string" && r.testcase_id) + ?.testcase_id ?? 
null + if (!testcaseId) continue + const cached = queryClient.getQueryData(["testcase", projectId, testcaseId]) + if (cached === undefined && !materializer?.hasFailed("testcases", {testcaseId})) { + return true + } + } else if (slice === "traces") { + if (!resultsFetched) continue + const cachedResults = evaluationResultMolecule.get.byScenario({ + projectId, + runId, + scenarioId, + }) + const traceId = + cachedResults?.find((r) => typeof r.trace_id === "string" && r.trace_id) + ?.trace_id ?? null + if (!traceId) continue + const cached = queryClient.getQueryData(["trace-entity", projectId, traceId]) + if (cached === undefined && !materializer?.hasFailed("traces", {traceId})) { + return true + } + } + } + return false + }, [ + columnKind, + projectId, + runId, + scenarioId, + resultsFetched, + metricsFetched, + materializer, + queryClient, + hydrationVersion, + ]) + + const isTerminal = isTerminalStatus(scenarioStatus) + + let content: React.ReactNode + if (hasValue) { + content = ( +
+ {formatValue(unwrapStatsForCompare(resolved!.value))} +
+ ) + } else if (!isTerminal) { + // Scenario not finished — in-progress, NOT a missing value. + content = + } else if (sliceStillLoading) { + // Terminal scenario, this cell's slices not hydrated yet. + content =
+ } else { + // Terminal, hydrated, genuinely no value. + content = + } + + return
{content}
+} + +/** + * In-progress indicator for a non-terminal scenario's cell. A colored, + * pulsing dot + label — deliberately distinct from both the grey skeleton + * bar (data loading) and the "—" placeholder (no value). + */ +const RunningIndicator = ({status}: {status: string}) => { + const s = status.toLowerCase() + const dotClass = s === "running" ? "bg-blue-500" : "bg-amber-400" + const label = s === "running" ? "Running" : s === "queued" ? "Queued" : "Pending" + return ( + + + {label} + + ) +} + +/** + * Fixed-height placeholder for skeleton (not-yet-keyed) rows. Occupies + * the same `scenario-table-cell` box as a populated cell so the table + * doesn't jump when a skeleton row resolves to real data. + */ +export const EtlSkeletonCell = () => ( +
+
+
+) + +function formatValue(v: unknown): React.ReactNode { + if (v === null || v === undefined) { + return + } + if (typeof v === "boolean") { + return {String(v)} + } + if (typeof v === "number") { + return Number.isInteger(v) ? String(v) : v.toFixed(3) + } + // Cap at 800 chars as a DOM-size guard; the cell's CSS line-clamp does + // the visible truncation. + if (typeof v === "string") { + return v.length > 800 ? v.slice(0, 800) : v + } + try { + const json = JSON.stringify(v) + return json.length > 800 ? json.slice(0, 800) : json + } catch { + return String(v) + } +} + +export default EtlResolvedCell diff --git a/web/oss/src/components/EvalRunDetails/etl/useCellMaterialization.ts b/web/oss/src/components/EvalRunDetails/etl/useCellMaterialization.ts new file mode 100644 index 0000000000..d1b12e2d1b --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/useCellMaterialization.ts @@ -0,0 +1,268 @@ +/** + * useCellMaterialization — lazy, batched, run-aware cell-side prefetch. + * + * The page-level `useHydrateScenarios` only fetches entity slices the + * active predicate touches (Phase 2). In Phase 1 (no predicate) it fetches + * nothing, so every visible cell materializes itself. + * + * If 30 visible cells each call `molecule.actions.prefetchByScenarioIds( + * [scenarioId])` independently, the backend gets 30 round trips. To avoid + * that, this hook coalesces same-tick requests: + * + * 1. Cell asks for `(slice, {scenarioId, runId})` on first render. + * 2. Request is queued in a per-slice ref-set. + * 3. After a microtask flush, the hook drains every per-slice queue and + * issues ONE bulk prefetch per (slice, runId) with all requested IDs. + * 4. Cells re-render via `hydrationVersionAtom` once the writes land. + * + * Run-aware: results / metrics caches are run-scoped, so the queue is + * grouped by `runId` and one prefetch is issued per run. This is what + * lets comparison rows (which carry a different `runId` than the base + * run) hydrate correctly. 
+ */ + +import {useEffect, useRef} from "react" + +import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun" +import type {EntitySlice} from "@agenta/entities/evaluationRun/etl" +import {testcaseMolecule} from "@agenta/entities/testcase" +import {traceSpanMolecule} from "@agenta/entities/trace" +import {getDefaultStore, useSetAtom} from "jotai" +import {queryClientAtom} from "jotai-tanstack-query" + +import {hydrationVersionAtom} from "./useHydrateScenarios" + +interface MaterializeRequest { + /** scenarioId — required for results / metrics. */ + scenarioId?: string + /** runId — required for results / metrics (run-scoped caches). */ + runId?: string + /** testcase_id — required for testcases. */ + testcaseId?: string + /** trace_id — required for traces. */ + traceId?: string +} + +interface BatchState { + /** Queued requests per slice. Drained on next microtask. */ + queues: Record + /** Per-slice "currently fetching" tracking keys so we don't double-fire. */ + inflightKeys: Record> + /** + * Per-slice "tried and got nothing back" tracking keys. The most + * common cause is HTTP 429 rate-limiting — the molecule's prefetch + * swallows the error and returns empty, leaving the cache empty. + * Without this set, the cell rerenders forever in a tight retry loop. + * Marked permanently for the session — user reloads to retry. + */ + failedKeys: Record> + /** True if a drain is already scheduled this tick. */ + scheduled: boolean +} + +const initialBatchState = (): BatchState => ({ + queues: {results: [], metrics: [], testcases: [], traces: []}, + inflightKeys: { + results: new Set(), + metrics: new Set(), + testcases: new Set(), + traces: new Set(), + }, + failedKeys: { + results: new Set(), + metrics: new Set(), + testcases: new Set(), + traces: new Set(), + }, + scheduled: false, +}) + +/** + * Stable tracking key for a (slice, request) pair. 
Results / metrics are + * run-scoped so the key includes `runId`; testcases / traces are keyed by + * their own id. Returns null when the request lacks the fields the slice + * needs. + */ +const trackingKey = (slice: EntitySlice, req: MaterializeRequest): string | null => { + if (slice === "results" || slice === "metrics") { + if (!req.runId || !req.scenarioId) return null + return `${req.runId}::${req.scenarioId}` + } + if (slice === "testcases") return req.testcaseId ?? null + if (slice === "traces") return req.traceId ?? null + return null +} + +interface UseCellMaterializationArgs { + projectId: string | null + /** Page (base) run id — used only to reset state on scope change. */ + runId: string | null +} + +export interface CellMaterializer { + /** + * Request materialization of (slice, request). The hook coalesces + * concurrent requests on the same microtask into one bulk fetch per + * (slice, runId). Safe to call repeatedly from a cell's render — + * duplicates are deduped. + */ + request: (slice: EntitySlice, req: MaterializeRequest) => void + /** + * True when a prior fetch for (slice, request) settled without + * populating the cache — most often a 429. Lets a cell stop showing a + * skeleton for a slice that will never arrive this session. + */ + hasFailed: (slice: EntitySlice, req: MaterializeRequest) => boolean +} + +const groupScenariosByRun = (reqs: MaterializeRequest[]): Map => { + const out = new Map() + for (const r of reqs) { + if (!r.runId || !r.scenarioId) continue + const arr = out.get(r.runId) ?? 
[] + if (!arr.includes(r.scenarioId)) arr.push(r.scenarioId) + out.set(r.runId, arr) + } + return out +} + +const dedupField = (reqs: MaterializeRequest[], field: "testcaseId" | "traceId"): string[] => { + const out = new Set() + for (const r of reqs) { + const v = r[field] + if (typeof v === "string" && v) out.add(v) + } + return Array.from(out) +} + +export const useCellMaterialization = ({ + projectId, + runId, +}: UseCellMaterializationArgs): CellMaterializer => { + const stateRef = useRef(initialBatchState()) + const bumpHydrationVersion = useSetAtom(hydrationVersionAtom) + + useEffect(() => { + // Reset on scope change. + stateRef.current = initialBatchState() + }, [projectId, runId]) + + const drain = async () => { + const state = stateRef.current + state.scheduled = false + if (!projectId) return + + // Snapshot + reset the queues — new requests can queue while + // we're fetching, those trigger their own drain. + const queues = state.queues + state.queues = {results: [], metrics: [], testcases: [], traces: []} + + const resultsByRun = groupScenariosByRun(queues.results) + const metricsByRun = groupScenariosByRun(queues.metrics) + const testcaseIds = dedupField(queues.testcases, "testcaseId") + const traceIds = dedupField(queues.traces, "traceId") + + // Mark in-flight before starting fetch so subsequent ticks dedupe. + for (const [run, ids] of resultsByRun) { + for (const id of ids) state.inflightKeys.results.add(`${run}::${id}`) + } + for (const [run, ids] of metricsByRun) { + for (const id of ids) state.inflightKeys.metrics.add(`${run}::${id}`) + } + for (const id of testcaseIds) state.inflightKeys.testcases.add(id) + for (const id of traceIds) state.inflightKeys.traces.add(id) + + const qc = getDefaultStore().get(queryClientAtom) + + // After a fetch settles, for each requested id check whether the + // cache now holds data. 
If not, the fetch failed silently (most + // often a 429) — mark it failed so request() skips it on future + // renders, avoiding an infinite request → 429 → retry loop. + const markRunFailures = ( + slice: "results" | "metrics", + run: string, + scenarioIds: string[], + ) => { + if (!qc) return + const prefix = slice === "results" ? "evaluation-results" : "evaluation-metrics" + for (const id of scenarioIds) { + const tk = `${run}::${id}` + state.inflightKeys[slice].delete(tk) + const cached = qc.getQueryData([prefix, projectId, run, id]) + if (cached === undefined) state.failedKeys[slice].add(tk) + } + } + const markIdFailures = (slice: "testcases" | "traces", ids: string[]) => { + if (!qc) return + const prefix = slice === "testcases" ? "testcase" : "trace-entity" + for (const id of ids) { + state.inflightKeys[slice].delete(id) + const cached = qc.getQueryData([prefix, projectId, id]) + if (cached === undefined) state.failedKeys[slice].add(id) + } + } + + const tasks: Promise[] = [] + for (const [run, scenarioIds] of resultsByRun) { + tasks.push( + evaluationResultMolecule.actions + .prefetchByScenarioIds({projectId, runId: run, scenarioIds}) + .finally(() => markRunFailures("results", run, scenarioIds)), + ) + } + for (const [run, scenarioIds] of metricsByRun) { + tasks.push( + evaluationMetricMolecule.actions + .prefetchByScenarioIds({projectId, runId: run, scenarioIds}) + .finally(() => markRunFailures("metrics", run, scenarioIds)), + ) + } + if (testcaseIds.length > 0) { + tasks.push( + testcaseMolecule.actions + .prefetchByIds({projectId, testcaseIds}) + .finally(() => markIdFailures("testcases", testcaseIds)), + ) + } + if (traceIds.length > 0) { + tasks.push( + traceSpanMolecule.actions + .prefetchByIds({projectId, traceIds}) + .finally(() => markIdFailures("traces", traceIds)), + ) + } + + try { + await Promise.all(tasks) + // Bump so cells re-render and pick up their newly-cached data. 
+ if (tasks.length > 0) bumpHydrationVersion((v) => v + 1) + } catch (e) { + console.warn("[useCellMaterialization] batch failed:", e) + } + } + + const request: CellMaterializer["request"] = (slice, req) => { + const state = stateRef.current + const tk = trackingKey(slice, req) + if (!tk) return + // Skip if a previous drain for this key failed (most often a 429). + if (state.failedKeys[slice].has(tk)) return + // Skip if already being fetched by an earlier batch. + if (state.inflightKeys[slice].has(tk)) return + // Skip if a sibling cell already queued the same key this tick. + if (state.queues[slice].some((r) => trackingKey(slice, r) === tk)) return + state.queues[slice].push(req) + if (!state.scheduled) { + state.scheduled = true + queueMicrotask(drain) + } + } + + const hasFailed: CellMaterializer["hasFailed"] = (slice, req) => { + const tk = trackingKey(slice, req) + if (!tk) return false + return stateRef.current.failedKeys[slice].has(tk) + } + + return {request, hasFailed} +} diff --git a/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx b/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx new file mode 100644 index 0000000000..34890f348a --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx @@ -0,0 +1,107 @@ +/** + * useEtlColumns + * + * Derives the scenario table's **schema columns** (testset / application / + * evaluator(s) / metrics / other) from a run's schema (steps + mappings) + * via `groupRunColumns`, and adapts them into nested-header IVT columns + * whose leaf cells mount `EtlResolvedCell`. + * + * This replaces the backend-metadata column path (`usePreviewColumns`) + * for the *rendered* schema columns. The meta columns (index / status, + * timestamp, action) and the column-visibility trigger stay on the + * production path — they are not schema-derived. The two are stitched + * together in `Table.tsx`. 
+ * + * "other"-kind columns are kept (the PoC dropped them) so the visible + * column set matches the backend-metadata path. + */ + +import {useMemo} from "react" + +import {groupRunColumns, type ColumnGroup, type RunSchema} from "@agenta/entities/evaluationRun/etl" +import {Tooltip} from "antd" +import type {ColumnsType} from "antd/es/table" + +import type {PreviewTableRow} from "../atoms/tableRows" + +import EtlResolvedCell, {EtlSkeletonCell} from "./cells/EtlResolvedCell" +import EtlColumnHeader from "./EtlColumnHeader" + +const WIDTH_BY_KIND: Record = { + testset: 220, + application: 400, + evaluator: 180, + metrics: 140, + other: 180, +} + +export interface UseEtlColumnsArgs { + projectId: string | null + runId: string | null + schema: RunSchema | null +} + +/** + * Schema columns for the scenario table, as nested-header IVT columns. + * Empty until the run schema is available. + */ +export const useEtlColumns = ({ + projectId, + runId, + schema, +}: UseEtlColumnsArgs): ColumnsType => { + return useMemo>(() => { + if (!schema || !projectId || !runId) return [] + + const grouped = groupRunColumns(schema.steps, schema.mappings) + + return grouped.map((g) => { + const children = g.columns.map((leaf) => { + const key = `${g.group.key}::${leaf.name}` + return { + key, + columnVisibilityLabel: leaf.name, + title: ( + + + {leaf.name} + + + ), + width: WIDTH_BY_KIND[leaf.kind], + minWidth: WIDTH_BY_KIND[leaf.kind], + ellipsis: true, + align: "left" as const, + render: (_: unknown, record: PreviewTableRow) => { + // Skeleton / not-yet-keyed rows (incl. comparison + // placeholders) render a fixed-height placeholder. 
+ if (record.__isSkeleton || !record.scenarioId) { + return + } + return ( + + ) + }, + } + }) + + return { + key: g.group.key, + columnVisibilityLabel: g.group.label, + title: , + align: "left" as const, + children, + } + }) + }, [projectId, runId, schema]) +} diff --git a/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts b/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts new file mode 100644 index 0000000000..b2e07a5dc6 --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts @@ -0,0 +1,262 @@ +/** + * useHydrateScenarios + * + * Watches the scenario rows the table has loaded and triggers a bulk + * hydrate pass per *new* page. Mirrors the ETL strategy proved by the + * `EtlPocScenarios` PoC: bulk requests per page, all entities populated + * together. + * + * Flow per newly-seen scenario set: + * 1. evaluationResultMolecule.actions.prefetchByScenarioIds → results + * 2. evaluationMetricMolecule.actions.prefetchByScenarioIds → metrics + * 3. derive testcase_ids from results + * 4. prefetchTestcasesByIds(...) → testcases + * 5. derive trace_ids from results + * 6. prefetchTracesByIds(...) → traces + * + * Cache writes go through the molecules' `setQueryData` paths, so cells + * subscribing via `useQuery({queryKey: cacheKey, enabled: false})` see + * the data the moment it lands. + * + * Phase 1 note: with no active predicate and `sliceMode === "auto"` the + * page-level hydrate is intentionally a no-op — cells materialize their + * own (visible-only) data via `useCellMaterialization`. The hook is wired + * now so Phase 2 filtering can drive predicate-aware page hydration + * without a structural change. 
+ */ + +import {useEffect, useMemo, useRef, useState} from "react" + +import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun" +import { + predicateToEntitySlices, + type EntitySlice, + type RowPredicate, + type RunSchema, +} from "@agenta/entities/evaluationRun/etl" +import {prefetchTestcasesByIds} from "@agenta/entities/testcase" +import {prefetchTracesByIds} from "@agenta/entities/trace" +import {atom, useSetAtom} from "jotai" + +const ALL_SLICES: EntitySlice[] = ["results", "metrics", "testcases", "traces"] + +/** + * Minimal row shape this hook reads — identity + skeleton flag. Kept + * structural (fields `unknown`) so it accepts both `PreviewTableRow[]` and + * the loosely-typed `InfiniteTableRowBase[]` the IVT pagination hook + * returns, without coupling to either. + */ +export interface HydratableRowRef { + scenarioId?: unknown + __isSkeleton?: unknown +} + +/** + * Hydration-version atom — bumped each time a hydrate / materialize batch + * completes. Cells subscribe to it so they re-render and pick up + * late-arriving testcase / trace cache writes (whose IDs aren't known + * until results land). Cheap: number atom, single React tick per batch. + */ +export const hydrationVersionAtom = atom(0) + +export interface HydrationProgress { + /** Total unique scenario IDs hydrated since mount. */ + hydratedScenarios: number + /** Pages observed (one bulk hydrate pass per page). */ + pagesHydrated: number + /** Which entity slices the next page load will fetch. */ + activeSlices: EntitySlice[] + /** Last error from any prefetch call, or null. */ + lastError: string | null + /** True while a hydrate pass is mid-flight. */ + isHydrating: boolean +} + +const INITIAL_PROGRESS: HydrationProgress = { + hydratedScenarios: 0, + pagesHydrated: 0, + activeSlices: ALL_SLICES, + lastError: null, + isHydrating: false, +} + +/** + * Slice-fetch strategy for the page-level hydrate. 
+ * + * - "auto" (default): fetch only what's needed right now. With an active + * predicate that's the predicate's slice set; with no predicate that's + * zero slices — cells materialize their own data on first render + * (visible-only, virtualization-aware). + * - "all": always fetch all 4 slices. For workflows that need every + * column populated up-front (exports, bulk actions). + */ +export type SliceFetchMode = "auto" | "all" + +export interface UseHydrateScenariosArgs { + projectId: string | null + runId: string | null + rows: readonly HydratableRowRef[] + /** Run schema — maps an active predicate's column to entity slices. */ + schema?: RunSchema | null + /** Active predicate(s) — Phase 2 filtering. Null in Phase 1. */ + predicate?: RowPredicate | RowPredicate[] | null + /** Hydrate strategy — see `SliceFetchMode`. Default "auto". */ + sliceMode?: SliceFetchMode +} + +export const useHydrateScenarios = ({ + projectId, + runId, + rows, + schema = null, + predicate = null, + sliceMode = "auto", +}: UseHydrateScenariosArgs): HydrationProgress => { + const [progress, setProgress] = useState(INITIAL_PROGRESS) + const hydratedScenarioIdsRef = useRef>(new Set()) + const inflightRef = useRef | null>(null) + const bumpHydrationVersion = useSetAtom(hydrationVersionAtom) + + // Compute the slice set this hydrate pass should fetch. + const activeSlices = useMemo(() => { + if (sliceMode === "all") return ALL_SLICES + const result = predicateToEntitySlices(schema, predicate) + if (result.fallbackToAll) return ALL_SLICES + if (result.slices.size === 0) { + // No predicate active in auto mode → page-level hydrate is a + // no-op. Cells materialize what they need on first render. + return [] + } + // Always include results when testcases or traces are needed — + // those IDs live on result rows. 
+ const slices = new Set(result.slices) + if (slices.has("testcases") || slices.has("traces")) slices.add("results") + return ALL_SLICES.filter((s) => slices.has(s)) + }, [schema, predicate, sliceMode]) + + const activeSlicesKey = activeSlices.join(",") + useEffect(() => { + hydratedScenarioIdsRef.current = new Set() + setProgress({...INITIAL_PROGRESS, activeSlices}) + }, [projectId, runId, activeSlicesKey]) + + useEffect(() => { + if (!projectId || !runId) return + // Only consider materialized (non-skeleton) scenarios with real IDs. + const candidateIds = rows + .filter( + (r) => + !r.__isSkeleton && typeof r.scenarioId === "string" && r.scenarioId.length > 0, + ) + .map((r) => r.scenarioId as string) + + const seen = hydratedScenarioIdsRef.current + const newIds = candidateIds.filter((id) => !seen.has(id)) + if (newIds.length === 0) return + + const slicesToFetch = new Set(activeSlices) + // Pure on-demand mode: nothing to fetch at the page level. Cells + // handle their own materialization via useCellMaterialization. + if (slicesToFetch.size === 0) { + for (const id of newIds) seen.add(id) + setProgress((p) => ({ + ...p, + hydratedScenarios: p.hydratedScenarios + newIds.length, + pagesHydrated: p.pagesHydrated + 1, + isHydrating: false, + lastError: null, + })) + return + } + + // Mark optimistically so a re-render mid-flight doesn't queue + // duplicate prefetch calls for the same scenarios. + for (const id of newIds) seen.add(id) + + const emptyOutcome = {cacheHits: 0, cacheMisses: 0, fetchMs: 0} + + const hydrateBatch = async () => { + setProgress((p) => ({...p, isHydrating: true, lastError: null})) + try { + // Stage 1 — results + metrics (parallel). + const [resultsOutcome] = await Promise.all([ + slicesToFetch.has("results") + ? evaluationResultMolecule.actions.prefetchByScenarioIds({ + projectId, + runId, + scenarioIds: newIds, + }) + : Promise.resolve({ + ...emptyOutcome, + results: [], + byScenarioId: new Map(), + }), + slicesToFetch.has("metrics") + ? 
evaluationMetricMolecule.actions.prefetchByScenarioIds({ + projectId, + runId, + scenarioIds: newIds, + }) + : Promise.resolve(null), + ]) + + // Stage 2 — derive testcase_ids + trace_ids from results. + const testcaseIds = new Set() + if (slicesToFetch.has("testcases")) { + for (const result of resultsOutcome.results) { + if (typeof result.testcase_id === "string" && result.testcase_id) { + testcaseIds.add(result.testcase_id) + } + } + } + + const traceIds = new Set() + if (slicesToFetch.has("traces")) { + for (const result of resultsOutcome.results) { + if (typeof result.trace_id === "string" && result.trace_id) { + traceIds.add(result.trace_id) + } + } + } + + await Promise.all([ + testcaseIds.size > 0 + ? prefetchTestcasesByIds({ + projectId, + testcaseIds: Array.from(testcaseIds), + }) + : Promise.resolve(emptyOutcome), + traceIds.size > 0 + ? prefetchTracesByIds({ + projectId, + traceIds: Array.from(traceIds), + }) + : Promise.resolve(emptyOutcome), + ]) + + setProgress((p) => ({ + hydratedScenarios: p.hydratedScenarios + newIds.length, + pagesHydrated: p.pagesHydrated + 1, + activeSlices, + lastError: null, + isHydrating: false, + })) + bumpHydrationVersion((v) => v + 1) + } catch (e) { + // On failure, un-mark so the next render can retry. + for (const id of newIds) seen.delete(id) + setProgress((p) => ({ + ...p, + lastError: e instanceof Error ? e.message : String(e), + isHydrating: false, + })) + } + } + + // Serialize hydrate calls — multiple page-loads in quick + // succession get queued, not parallel. + inflightRef.current = (inflightRef.current ?? 
Promise.resolve()).then(hydrateBatch) + }, [projectId, runId, rows, activeSlicesKey, activeSlices, bumpHydrationVersion]) + + return progress +} diff --git a/web/oss/src/components/EvalRunDetails/etl/useScopeChangeEviction.ts b/web/oss/src/components/EvalRunDetails/etl/useScopeChangeEviction.ts new file mode 100644 index 0000000000..70fcb520d5 --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/useScopeChangeEviction.ts @@ -0,0 +1,52 @@ +/** + * useScopeChangeEviction + * + * Evicts the molecule caches the ETL hydrate path wrote when the + * (projectId, runId) scope changes or the table unmounts. + * + * Triggers: + * - on dependency change (the *previous* scope's data gets evicted) + * - on unmount (component going away — release everything we wrote) + * + * What it evicts: + * - results + metrics → molecule.actions.evictByRunId (scoped to runId) + * - testcase + trace-entity + span → clearCacheByPrefix (run-agnostic) + * + * Atom families are intentionally NOT cleared: other views (focus drawer, + * observability tab) may subscribe to the same trace atoms. A + * `family.clear()` would yank their subscriptions too. + */ + +import {useEffect, useRef} from "react" + +import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun" +import {clearCacheByPrefix} from "@agenta/entities/evaluationRun/etl" + +export interface UseScopeChangeEvictionArgs { + projectId: string | null + runId: string | null +} + +export const useScopeChangeEviction = ({projectId, runId}: UseScopeChangeEvictionArgs): void => { + // Track the previous (projectId, runId) so the cleanup function evicts + // the *outgoing* scope, not the incoming one. 
+ const prevRef = useRef<{projectId: string | null; runId: string | null}>({ + projectId: null, + runId: null, + }) + + useEffect(() => { + prevRef.current = {projectId, runId} + return () => { + const {projectId: pp, runId: rr} = prevRef.current + if (!pp || !rr) return + try { + evaluationResultMolecule.actions.evictByRunId({projectId: pp, runId: rr}) + evaluationMetricMolecule.actions.evictByRunId({projectId: pp, runId: rr}) + clearCacheByPrefix(["testcase", "trace-entity", "span"]) + } catch { + // QueryClient may already be torn down on app close — swallow. + } + } + }, [projectId, runId]) +} diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts new file mode 100644 index 0000000000..aab45d27bd --- /dev/null +++ b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts @@ -0,0 +1,179 @@ +/** + * groupRunColumns — column-parity regression guard for the ETL scenario + * table migration (docs/designs/eval-scenarios-table-integration.md, T2). + * + * The backend-metadata column path (`usePreviewColumns`) and the run-graph + * column path (`useEtlColumns` → `groupRunColumns`) must surface the SAME + * visible column set. The most load-bearing part of that: the PoC's + * `useEtlColumns` dropped `group.kind === "other"` columns ("skip in the + * test page"). Production must keep them — dropping them silently shrinks + * the user-visible column set. These tests pin that down. + */ + +import assert from "node:assert/strict" +import {describe, it} from "node:test" + +import {groupRunColumns, type RunMapping, type RunStep} from "../resolveMappings" + +// A representative testset+app+evaluator run schema. 
auto / human / online +// runs all share this shape — the eval type only changes which metrics +// show and the scenario fetch order, neither of which `groupRunColumns` +// (a pure steps+mappings function) is aware of. +const STEPS: RunStep[] = [ + {key: "input", type: "input", references: {testset: {id: "ts1", slug: "my-testset"}}}, + { + key: "invocation", + type: "invocation", + references: {application: {id: "app1", slug: "my-app"}}, + }, + { + key: "eval-exact", + type: "annotation", + references: {evaluator: {id: "ev1", slug: "exact-match"}}, + }, +] + +const MAPPINGS: RunMapping[] = [ + {column: {kind: "input", name: "question"}, step: {key: "input", path: "data.question"}}, + { + column: {kind: "input", name: "ground_truth"}, + step: {key: "input", path: "data.ground_truth"}, + }, + { + column: {kind: "invocation", name: "output"}, + step: {key: "invocation", path: "attributes.ag.data.outputs"}, + }, + { + column: {kind: "annotation", name: "success"}, + step: {key: "eval-exact", path: "attributes.ag.data.outputs.success"}, + }, + // Metrics path overrides the step-type grouping → "metrics" group. 
+ { + column: {kind: "metric", name: "Cost"}, + step: {key: "invocation", path: "attributes.ag.metrics.costs.cumulative.total"}, + }, +] + +describe("groupRunColumns — testset/app/evaluator/metrics", () => { + it("groups columns by source in stable order", () => { + const grouped = groupRunColumns(STEPS, MAPPINGS) + assert.deepEqual( + grouped.map((g) => g.group.kind), + ["testset", "application", "evaluator", "metrics"], + ) + }) + + it("keeps every mapped column — none dropped", () => { + const grouped = groupRunColumns(STEPS, MAPPINGS) + const total = grouped.reduce((n, g) => n + g.columns.length, 0) + assert.equal(total, MAPPINGS.length) + }) + + it("places multiple columns under their shared group", () => { + const grouped = groupRunColumns(STEPS, MAPPINGS) + const testset = grouped.find((g) => g.group.kind === "testset") + assert.ok(testset) + assert.deepEqual( + testset.columns.map((c) => c.name), + ["question", "ground_truth"], + ) + }) + + it("carries group kind + slug onto each leaf", () => { + const grouped = groupRunColumns(STEPS, MAPPINGS) + const evaluator = grouped.find((g) => g.group.kind === "evaluator") + assert.ok(evaluator) + assert.equal(evaluator.columns[0].name, "success") + assert.equal(evaluator.columns[0].kind, "evaluator") + assert.equal(evaluator.columns[0].groupSlug, "exact-match") + }) +}) + +describe("groupRunColumns — 'other' columns are INCLUDED (regression)", () => { + it("includes columns whose step has an unrecognised type", () => { + const steps: RunStep[] = [...STEPS, {key: "transform", type: "transform"}] + const mappings: RunMapping[] = [ + ...MAPPINGS, + { + column: {kind: "transform", name: "normalized"}, + step: {key: "transform", path: "data.normalized"}, + }, + ] + const grouped = groupRunColumns(steps, mappings) + const other = grouped.find((g) => g.group.kind === "other") + assert.ok(other, "the unrecognised-step column must produce an 'other' group") + assert.deepEqual( + other.columns.map((c) => c.name), + 
["normalized"], + ) + // "other" sorts last. + assert.equal(grouped[grouped.length - 1].group.kind, "other") + }) + + it("includes columns whose mapping references a missing step", () => { + const mappings: RunMapping[] = [ + ...MAPPINGS, + {column: {kind: "meta", name: "orphan"}, step: {key: "does-not-exist", path: "x"}}, + ] + const grouped = groupRunColumns(STEPS, mappings) + const other = grouped.find((g) => g.group.kind === "other") + assert.ok(other, "a mapping with no resolvable step must produce an 'other' group") + assert.deepEqual( + other.columns.map((c) => c.name), + ["orphan"], + ) + }) + + it("the visible column count includes 'other' columns", () => { + const steps: RunStep[] = [...STEPS, {key: "transform", type: "transform"}] + const mappings: RunMapping[] = [ + ...MAPPINGS, + {column: {name: "normalized"}, step: {key: "transform", path: "p"}}, + ] + const grouped = groupRunColumns(steps, mappings) + const total = grouped.reduce((n, g) => n + g.columns.length, 0) + assert.equal(total, mappings.length) + }) +}) + +describe("groupRunColumns — edge cases", () => { + it("skips mappings with no column name", () => { + const mappings: RunMapping[] = [ + ...MAPPINGS, + {column: {kind: "input", name: ""}, step: {key: "input", path: "data.blank"}}, + {column: {kind: "input"}, step: {key: "input", path: "data.noname"}}, + ] + const grouped = groupRunColumns(STEPS, mappings) + const total = grouped.reduce((n, g) => n + g.columns.length, 0) + assert.equal(total, MAPPINGS.length) + }) + + it("returns an empty list for an empty schema", () => { + assert.deepEqual(groupRunColumns([], []), []) + }) + + it("disambiguates two evaluators emitting the same column name", () => { + const steps: RunStep[] = [ + ...STEPS, + { + key: "eval-judge", + type: "annotation", + references: {evaluator: {id: "ev2", slug: "llm-judge"}}, + }, + ] + const mappings: RunMapping[] = [ + ...MAPPINGS, + { + column: {kind: "annotation", name: "success"}, + step: {key: "eval-judge", path: 
"attributes.ag.data.outputs.success"}, + }, + ] + const grouped = groupRunColumns(steps, mappings) + const evaluators = grouped.filter((g) => g.group.kind === "evaluator") + assert.equal(evaluators.length, 2) + assert.deepEqual( + evaluators.map((g) => g.group.slug), + ["exact-match", "llm-judge"], + ) + }) +}) diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts index 505e3c5cb0..d1f5b6acfe 100644 --- a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts +++ b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts @@ -41,6 +41,8 @@ export type { ResolveMappingsOptions, ColumnGroup, ResolvedColumnGroup, + RunColumnLeaf, + RunColumnGroup, } from "./resolveMappings" export { DEFAULT_STEP_RESOLVERS, @@ -53,6 +55,7 @@ export { resolveMappings, computeColumnGroup, groupResolvedColumns, + groupRunColumns, } from "./resolveMappings" // Molecule-backed cache-aware fetchers — all 4 entity types go through diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts b/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts index c34ac1fb57..1cf67f5d92 100644 --- a/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts +++ b/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts @@ -637,3 +637,78 @@ export function groupResolvedColumns(columns: ResolvedColumn[]): ResolvedColumnG return (firstAppearance.get(a.group.key) ?? 0) - (firstAppearance.get(b.group.key) ?? 0) }) } + +// ============================================================================ +// Pre-resolution column grouping — group raw mappings by source. +// +// `groupResolvedColumns` above groups columns AFTER a row's values are +// resolved. `groupRunColumns` works directly off the run schema +// (steps + mappings), so the UI can build column headers before any +// scenario data is hydrated. 
+// ============================================================================ + +/** A single UI column leaf, before value resolution. */ +export interface RunColumnLeaf { + /** Column display name (from `mapping.column.name`). */ + name: string + /** Source category — testset / application / evaluator / metrics / other. */ + kind: ColumnGroup["kind"] + /** The owning group's slug (null for metrics and some "other" groups). */ + groupSlug: string | null +} + +/** A group of UI columns sharing a `ColumnGroup` — one nested header. */ +export interface RunColumnGroup { + group: ColumnGroup + columns: RunColumnLeaf[] +} + +/** + * Group a run's raw column mappings by source — testset / application / + * evaluator(s) / metrics / other. + * + * "other"-kind columns (steps with an unrecognised type, or mappings with + * no resolvable step) are **included**. They are real columns the + * backend-metadata column path also surfaces — dropping them would + * silently shrink the visible column set. + * + * Group order: testset → application → evaluator(s) → metrics → other. + * Within a kind, groups appear in the order their columns first appear in + * the mapping list (matching `groupResolvedColumns`). + */ +export function groupRunColumns(steps: RunStep[], mappings: RunMapping[]): RunColumnGroup[] { + const stepByKey = new Map() + for (const s of steps) stepByKey.set(s.key, s) + + const byKey = new Map() + const firstAppearance = new Map() + + mappings.forEach((mapping, idx) => { + const columnName = mapping.column?.name + if (typeof columnName !== "string" || !columnName) return + const step = mapping.step?.key ? (stepByKey.get(mapping.step.key) ?? null) : null + const path = mapping.step?.path ?? 
"" + const group = computeColumnGroup(step, path) + + let slot = byKey.get(group.key) + if (!slot) { + slot = {group, columns: []} + byKey.set(group.key, slot) + firstAppearance.set(group.key, idx) + } + slot.columns.push({name: columnName, kind: group.kind, groupSlug: group.slug}) + }) + + const kindOrder: Record = { + testset: 0, + application: 1, + evaluator: 2, + metrics: 3, + other: 4, + } + return Array.from(byKey.values()).sort((a, b) => { + const k = kindOrder[a.group.kind] - kindOrder[b.group.kind] + if (k !== 0) return k + return (firstAppearance.get(a.group.key) ?? 0) - (firstAppearance.get(b.group.key) ?? 0) + }) +} From 97245411cab137f6e5b7a22b9fc1e6cea7a08796 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 11:05:47 +0200 Subject: [PATCH 05/29] =?UTF-8?q?doc(eval):=20close=20T4=20filter=20compos?= =?UTF-8?q?ition=20=E2=80=94=20multi-predicate=20AND/OR=20(D8)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 filtering ships multi-condition AND/OR from day 1, not the PoC's single predicate. Records the decision, generalises the planned predicate type to a flat condition group, and marks T2+T3 as shipped. --- .../eval-scenarios-table-integration.md | 50 +++++++++++-------- 1 file changed, 28 insertions(+), 22 deletions(-) diff --git a/docs/designs/eval-scenarios-table-integration.md b/docs/designs/eval-scenarios-table-integration.md index 8d769872c2..e914d84424 100644 --- a/docs/designs/eval-scenarios-table-integration.md +++ b/docs/designs/eval-scenarios-table-integration.md @@ -56,11 +56,9 @@ molecule-cache pattern."* This is that project. | **D2 — comparison display** | Interleaved rows (today's model), not testcase-aligned columns. Compare-mode column set = shared testcase inputs + the **common-evaluator intersection** across compared runs + the standard invocation output. Reuses single-run column derivation. 
| | **D3 — live updates** | Match production's modest bar: run-status poll + page invalidation while non-terminal + human-eval metrics gap-fill. No real scenario streaming. | | **D5 — perf gate** | After Phase 1, benchmark the new table vs the current `useScenarioCellValue` table on a 1000+ scenario run with comparison on. A regression stops Phase 2. | +| **D8 — filter composition** | **Multi-predicate from day 1.** Phase 2 ships multi-condition AND/OR filtering, not the PoC's single predicate. The predicate type generalises to a condition *group* (`{op: "and" \| "or", conditions: RowPredicate[]}`); the filter bar reuses the observability multi-condition filter UI. Closed at Phase 2 start (was the one open decision). | -**Still open — closes before Phase 2:** - -- **Filter composition** — single predicate (PoC today) vs multi-condition - AND/OR. `eval-filtering.md` specs the fuller version. Decide at Phase 2 start. +**No open decisions.** --- @@ -131,14 +129,19 @@ current one on a 1000+ scenario run, comparison on. Regression → stop, rethink ## Phase 2 — filtering -- **T4 — filtering.** Decide filter composition first (see open decisions). - `filterSchema` derives filterable fields: columns → evaluator steps → - evaluator output schemas → typed fields + type-matched operators - (`eval-filtering.md` D4). The `filterTransform` evaluates the predicate per - row against hydrated metrics; the loop runs until the viewport fills. - **Reuse `withRateLimitRetry`** for the scan — a low-hit-ratio filter scans - many scenario + metric pages and EE throttling will 429 it (the batch-add - lesson). +- **T4 — multi-predicate filtering (D8).** Ships multi-condition AND/OR + from day 1 — not the PoC's single predicate. `filterSchema` derives + filterable fields: columns → evaluator steps → evaluator output schemas + → typed fields + type-matched operators (`eval-filtering.md` D4). 
The + predicate generalises from `RowPredicate` to a condition *group* + (`{op: "and" | "or", conditions: RowPredicate[]}`, one nesting level for + v1 — flat AND/OR, no arbitrary trees); `predicateToEntitySlices` takes + the union of every condition's slices. The `filterTransform` evaluates + the group per row against hydrated metrics; the loop runs until the + viewport fills. The filter bar reuses the observability multi-condition + filter UI. **Reuse `withRateLimitRetry`** for the scan — a low-hit-ratio + filter scans many scenario + metric pages and EE throttling will 429 it + (the batch-add lesson). --- @@ -193,8 +196,8 @@ design; these are the genuinely new design surfaces). blocking overlay. **Filter bar** — lives in the eval run details header row, following the -observability `Filters` placement. Single vs multi-predicate composition is the -open Phase 2 decision; if multi-predicate, reuse the observability filter UI. +observability `Filters` placement. Multi-predicate AND/OR composition (D8) — +reuses the observability multi-condition filter UI. ## Test plan @@ -206,8 +209,9 @@ T2 schema columns [GAP][CRITICAL][REGRESSION] resolveMappings column set "other" columns INCLUDED — before deleting the old path T3 cells [GAP] resolve from caches; pending/running/failed render [GAP] skeleton-while-pending: not-hydrated vs not-run -T4 filtering [GAP] filterSchema typed fields; filterTransform - match/no-match/pending; [→E2E] filter → rows +T4 filtering [GAP] filterSchema typed fields; multi-predicate + AND/OR filterTransform — match/no-match/pending + + group semantics; [→E2E] multi-condition → rows T5 comparison [GAP] compare-run schema fetch; testcase_id join; common-evaluator intersection; [→E2E] compare+filter T6 live updates [GAP] poll stops at terminal; page invalidation; gap-fill @@ -270,12 +274,12 @@ co-consumers. 
## Implementation tasks **Phase 1 — column + cell swap** (T1 dropped — `evaluationPreviewTableStore` is already thin; see Phase 1) -- [ ] **T2 (P1, human: ~1d / CC: ~2h)** — schema columns for the **rendered** table; keep "other" columns (ripples into `ColumnLeaf`/`EtlResolvedCell`/`useCellMaterialization`); **column-parity regression test**. Keep `usePreviewColumns`/`columnResult` alive for the export path — full retirement is Phase 3 / T5. Lands together with T3 (coupled). -- [ ] **T3 (P1, human: ~3d / CC: ~half-day)** — self-hydrating cells **plus non-terminal scenario rendering + skeleton-while-pending** (the unbuilt part). Lands together with T2. -- [ ] **Perf gate (P1)** — benchmark vs the old table, 1000+ scenarios, comparison on. +- [x] **T2 (P1)** — schema columns for the **rendered** table; keeps "other" columns; **column-parity regression test** (`groupRunColumns.test.ts`). `usePreviewColumns`/`columnResult` kept alive for the export path. Landed with T3. +- [x] **T3 (P1)** — self-hydrating cells **plus non-terminal scenario rendering + skeleton-while-pending**. Landed with T2. +- [ ] **Perf gate (P1)** — benchmark vs the old table, 1000+ scenarios, comparison on. **Gates T4.** **Phase 2 — filtering** -- [ ] **T4 (P1, human: ~3d / CC: ~half-day)** — `filterSchema` + `filterTransform` + predicate UI + viewport-fill loop; reuse `withRateLimitRetry`. Close the composition decision first. +- [ ] **T4 (P1, human: ~3d / CC: ~half-day)** — `filterSchema` + **multi-predicate AND/OR** `filterTransform` + multi-condition predicate UI + viewport-fill loop; reuse `withRateLimitRetry`. Composition decided (D8). Gated on the Phase 1 perf gate. **Phase 3 — comparison, live, co-consumers** - [ ] **T5 (P1, human: ~3d / CC: ~half-day)** — comparison build: compare-run schema fetch + per-run hydration + testcase_id join + export-path migration. @@ -300,5 +304,7 @@ co-consumers. 
- **DESIGN DECISIONS:** focused review (migration preserves the visual design) · live hit-ratio counter for filter scanning · interaction-state specs added (skeleton, non-terminal cells, filter no-match empty state, rate-limited indicator). - **D6 (implementation-time finding):** starting Phase 1 confirmed `evaluationPreviewTableStore` is already a thin store (identity + status, no column data, per-eval-type order). **T1 is dropped** — Phase 1 is the coupled T2+T3 column+cell swap against the existing store. Confirms the eng-review outside voice's "T1 re-implements an existing store" point. - **D7 (implementation-time finding):** reading `Table.tsx` showed the CSV export path (`exportResolveValue`, `columnLookupMap`, `loadAllPagesBeforeExport`) is keyed off `columnResult` column ids, which differ from `useEtlColumns` keys. **Phase 1 swaps display columns only** and keeps `usePreviewColumns`/`columnResult` alive for export; the old column path fully retires in Phase 3 with the export migration (T5). The "other"-column un-drop ripples into `ColumnLeaf`, `EtlResolvedCell`, and `useCellMaterialization`. -- **UNRESOLVED:** 1 — filter composition (single vs multi-predicate) + its UI, intentionally deferred to Phase 2 start. Phase 1 has no open decisions. -- **VERDICT:** ENG + DESIGN REVIEW CLEARED — ready to implement Phase 1 (T2+T3). +- **D8 (Phase 2 decision):** filter composition resolved — **multi-predicate AND/OR from day 1**, not the PoC's single predicate. The predicate type generalises to a flat condition group; the filter bar reuses the observability multi-condition UI. +- **UNRESOLVED:** 0 — filter composition closed (D8). No open decisions. +- **STATUS:** Phase 1 (T2+T3) implemented and committed. Next: the D5 perf gate, which gates T4. +- **VERDICT:** ENG + DESIGN REVIEW CLEARED — Phase 1 (T2+T3) shipped; T4 is multi-predicate, gated on the perf gate. 
From 165c008270b9df3baf5f9089120381a59351cfa0 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 11:12:45 +0200 Subject: [PATCH 06/29] =?UTF-8?q?feat(entities):=20T4=20core=20=E2=80=94?= =?UTF-8?q?=20multi-predicate=20AND/OR=20filter=20logic=20+=20filterSchema?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-stages the Phase 2 (T4) filtering core — pure logic, decoupled from the D5 perf gate that gates T4 wiring. - PredicateGroup (flat AND/OR, one nesting level) + RowFilter type; evaluateRowPredicate / evaluatePredicateGroup / evaluateRowFilter / matchesRowFilter row-level evaluators; makePredicateGroupFilter pipeline transform. makeRowPredicateFilter left intact. - predicateToEntitySlices accepts a PredicateGroup — slice set is the union of every condition's slices (AND vs OR does not change the fetch). - buildFilterSchema: derives filterable fields + type-matched operators from the run schema, with a resolveValueType refinement seam. - 25 unit tests (predicateGroup, filterSchema). 
--- .../etl/__tests__/filterSchema.test.ts | 107 ++++++ .../etl/__tests__/predicateGroup.test.ts | 329 ++++++++++++++++++ .../src/evaluationRun/etl/filterSchema.ts | 133 +++++++ .../src/evaluationRun/etl/index.ts | 25 ++ .../etl/predicateToEntitySlices.ts | 17 +- .../evaluationRun/etl/rowPredicateFilter.ts | 135 ++++++- 6 files changed, 742 insertions(+), 4 deletions(-) create mode 100644 web/packages/agenta-entities/src/evaluationRun/etl/__tests__/filterSchema.test.ts create mode 100644 web/packages/agenta-entities/src/evaluationRun/etl/__tests__/predicateGroup.test.ts create mode 100644 web/packages/agenta-entities/src/evaluationRun/etl/filterSchema.ts diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/filterSchema.test.ts b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/filterSchema.test.ts new file mode 100644 index 0000000000..86bb7eed80 --- /dev/null +++ b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/filterSchema.test.ts @@ -0,0 +1,107 @@ +/** + * buildFilterSchema — derives the filterable fields the Phase 2 / T4 + * filter UI offers (decision D8). + * + * Covers field derivation, the schema-only value-type heuristic, the + * type-matched operator sets, the `resolveValueType` refinement seam, and + * deduplication. + */ + +import assert from "node:assert/strict" +import {describe, it} from "node:test" + +import {buildFilterSchema, operatorsForType} from "../filterSchema" +import type {RunSchema} from "../resolveMappings" + +const SCHEMA: RunSchema = { + steps: [ + {key: "in", type: "input", references: {testset: {id: "t1", slug: "ts"}}}, + {key: "ev", type: "annotation", references: {evaluator: {id: "e1", slug: "exact-match"}}}, + ], + mappings: [ + {column: {kind: "input", name: "question"}, step: {key: "in", path: "data.question"}}, + {column: {kind: "annotation", name: "success"}, step: {key: "ev", path: "out"}}, + // Metrics path overrides step-type grouping → "metrics" group. 
+ { + column: {kind: "metric", name: "Cost"}, + step: {key: "ev", path: "attributes.ag.metrics.cost"}, + }, + ], +} + +describe("operatorsForType", () => { + it("number gets ordered comparisons", () => { + const ops = operatorsForType("number") + for (const op of ["lt", "lte", "gt", "gte"]) assert.ok(ops.includes(op as never)) + }) + + it("unknown / boolean withhold ordered comparisons", () => { + for (const op of ["lt", "gt"]) { + assert.equal(operatorsForType("unknown").includes(op as never), false) + assert.equal(operatorsForType("boolean").includes(op as never), false) + } + }) + + it("returns a fresh array (callers may mutate)", () => { + const a = operatorsForType("number") + a.pop() + assert.notEqual(a.length, operatorsForType("number").length) + }) +}) + +describe("buildFilterSchema", () => { + it("returns an empty schema for a null run schema", () => { + assert.deepEqual(buildFilterSchema(null), {fields: []}) + }) + + it("emits one field per mapped column", () => { + const {fields} = buildFilterSchema(SCHEMA) + assert.deepEqual(fields.map((f) => f.columnName).sort(), ["Cost", "question", "success"]) + }) + + it("types metrics columns as number, others as unknown", () => { + const {fields} = buildFilterSchema(SCHEMA) + const cost = fields.find((f) => f.columnName === "Cost") + const success = fields.find((f) => f.columnName === "success") + assert.equal(cost?.valueType, "number") + assert.equal(success?.valueType, "unknown") + assert.ok(cost?.operators.includes("gt")) + assert.equal(success?.operators.includes("gt"), false) + }) + + it("carries the targeting triple + labels", () => { + const {fields} = buildFilterSchema(SCHEMA) + const success = fields.find((f) => f.columnName === "success") + assert.equal(success?.groupKind, "evaluator") + assert.equal(success?.groupSlug, "exact-match") + assert.equal(success?.label, "success") + assert.ok(success?.groupLabel) + }) + + it("resolveValueType refines a field's type + operators", () => { + const {fields} = 
buildFilterSchema(SCHEMA, { + resolveValueType: (f) => (f.columnName === "success" ? "boolean" : undefined), + }) + const success = fields.find((f) => f.columnName === "success") + assert.equal(success?.valueType, "boolean") + assert.deepEqual(success?.operators, ["eq", "ne"]) + // Untouched fields keep the schema-only default. + assert.equal(fields.find((f) => f.columnName === "Cost")?.valueType, "number") + }) + + it("deduplicates identical (groupKind, groupSlug, columnName) triples", () => { + const dupSchema: RunSchema = { + steps: SCHEMA.steps, + mappings: [ + ...SCHEMA.mappings, + // Same column name + same step as an existing mapping. + { + column: {kind: "input", name: "question"}, + step: {key: "in", path: "data.question"}, + }, + ], + } + const {fields} = buildFilterSchema(dupSchema) + assert.equal(fields.filter((f) => f.columnName === "question").length, 1) + }) +}) diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/predicateGroup.test.ts b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/predicateGroup.test.ts new file mode 100644 index 0000000000..3c11d614c9 --- /dev/null +++ b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/predicateGroup.test.ts @@ -0,0 +1,329 @@ +/** + * Multi-predicate AND/OR filtering (Phase 2 / T4 — decision D8). + * + * Covers the pure predicate-evaluation core: single predicate, flat AND/OR + * groups, the `RowFilter` dispatch, the row-level `matchesRowFilter` + * convenience, the `makePredicateGroupFilter` pipeline transform, and the + * `predicateToEntitySlices` union for group inputs. 
+ */ + +import assert from "node:assert/strict" +import {describe, it} from "node:test" + +import type {Chunk} from "../../../etl/core/types" +import type {HydratedScenarioRow} from "../hydrateScenariosTransform" +import {predicateToEntitySlices} from "../predicateToEntitySlices" +import type {ColumnGroup, ResolvedColumn, RunSchema} from "../resolveMappings" +import { + evaluatePredicateGroup, + evaluateRowFilter, + evaluateRowPredicate, + isPredicateGroup, + makePredicateGroupFilter, + matchesRowFilter, + type PredicateGroup, + type RowPredicate, +} from "../rowPredicateFilter" + +// A resolved column fixture — the shape `resolveMappings` emits. +function col(opts: { + name: string + kind: ColumnGroup["kind"] + slug?: string | null + value: unknown +}): ResolvedColumn { + const group: ColumnGroup = { + kind: opts.kind, + slug: opts.slug ?? null, + label: opts.kind, + key: `${opts.kind}:${opts.slug ?? "x"}`, + refs: null, + } + return { + name: opts.name, + kind: opts.kind, + stepKey: "step", + stepType: opts.kind, + path: "", + value: opts.value, + source: "metric", + group, + } +} + +const COLS: ResolvedColumn[] = [ + col({name: "success", kind: "evaluator", slug: "exact-match", value: true}), + col({name: "score", kind: "evaluator", slug: "llm-judge", value: 0.9}), + col({name: "country", kind: "testset", slug: "ts", value: "US"}), +] + +// ============================================================================= +// evaluateRowPredicate — one clause +// ============================================================================= + +describe("evaluateRowPredicate", () => { + it("eq / ne", () => { + assert.equal( + evaluateRowPredicate( + {groupKind: "evaluator", columnName: "success", op: "eq", value: true}, + COLS, + ), + true, + ) + assert.equal( + evaluateRowPredicate( + {groupKind: "evaluator", columnName: "success", op: "ne", value: true}, + COLS, + ), + false, + ) + }) + + it("numeric comparisons", () => { + const p = (op: RowPredicate["op"], value: 
number): RowPredicate => ({ + groupKind: "evaluator", + columnName: "score", + op, + value, + }) + assert.equal(evaluateRowPredicate(p("gt", 0.8), COLS), true) + assert.equal(evaluateRowPredicate(p("gte", 0.9), COLS), true) + assert.equal(evaluateRowPredicate(p("lt", 0.8), COLS), false) + assert.equal(evaluateRowPredicate(p("lte", 0.9), COLS), true) + }) + + it("in / nin", () => { + assert.equal( + evaluateRowPredicate( + {groupKind: "testset", columnName: "country", op: "in", value: ["US", "CA"]}, + COLS, + ), + true, + ) + assert.equal( + evaluateRowPredicate( + {groupKind: "testset", columnName: "country", op: "nin", value: ["US", "CA"]}, + COLS, + ), + false, + ) + }) + + it("narrows by groupSlug when set", () => { + // Same column name across two evaluators — slug disambiguates. + const cols = [ + col({name: "success", kind: "evaluator", slug: "a", value: true}), + col({name: "success", kind: "evaluator", slug: "b", value: false}), + ] + assert.equal( + evaluateRowPredicate( + { + groupKind: "evaluator", + groupSlug: "b", + columnName: "success", + op: "eq", + value: false, + }, + cols, + ), + true, + ) + }) + + it("a missing column fails eq but passes ne (compares against undefined)", () => { + const p = (op: RowPredicate["op"]): RowPredicate => ({ + groupKind: "evaluator", + columnName: "does-not-exist", + op, + value: true, + }) + assert.equal(evaluateRowPredicate(p("eq"), COLS), false) + assert.equal(evaluateRowPredicate(p("ne"), COLS), true) + }) + + it("unwraps a stats-blob value before comparing", () => { + const cols = [ + col({ + name: "success", + kind: "evaluator", + value: {type: "binary", freq: [{value: true, density: 1}]}, + }), + ] + assert.equal( + evaluateRowPredicate( + {groupKind: "evaluator", columnName: "success", op: "eq", value: true}, + cols, + ), + true, + ) + }) +}) + +// ============================================================================= +// evaluatePredicateGroup — flat AND / OR +// 
============================================================================= + +describe("evaluatePredicateGroup", () => { + const pass: RowPredicate = { + groupKind: "evaluator", + columnName: "success", + op: "eq", + value: true, + } + const fail: RowPredicate = {groupKind: "evaluator", columnName: "score", op: "gt", value: 999} + + it("AND — every condition must match", () => { + assert.equal(evaluatePredicateGroup({op: "and", conditions: [pass, pass]}, COLS), true) + assert.equal(evaluatePredicateGroup({op: "and", conditions: [pass, fail]}, COLS), false) + }) + + it("OR — at least one condition must match", () => { + assert.equal(evaluatePredicateGroup({op: "or", conditions: [fail, pass]}, COLS), true) + assert.equal(evaluatePredicateGroup({op: "or", conditions: [fail, fail]}, COLS), false) + }) + + it("an empty group is no constraint — passes for both ops", () => { + assert.equal(evaluatePredicateGroup({op: "and", conditions: []}, COLS), true) + assert.equal(evaluatePredicateGroup({op: "or", conditions: []}, COLS), true) + }) +}) + +// ============================================================================= +// evaluateRowFilter — dispatch + isPredicateGroup +// ============================================================================= + +describe("evaluateRowFilter / isPredicateGroup", () => { + it("isPredicateGroup distinguishes a group from a single predicate", () => { + const single: RowPredicate = { + groupKind: "evaluator", + columnName: "success", + op: "eq", + value: true, + } + const group: PredicateGroup = {op: "and", conditions: [single]} + assert.equal(isPredicateGroup(single), false) + assert.equal(isPredicateGroup(group), true) + }) + + it("evaluates a single predicate or a group transparently", () => { + const single: RowPredicate = { + groupKind: "evaluator", + columnName: "success", + op: "eq", + value: true, + } + assert.equal(evaluateRowFilter(single, COLS), true) + assert.equal(evaluateRowFilter({op: "or", conditions: [single]}, 
COLS), true) + }) +}) + +// ============================================================================= +// matchesRowFilter — resolve schema, then evaluate +// ============================================================================= + +const ANNOTATION_SCHEMA: RunSchema = { + steps: [ + {key: "eval", type: "annotation", references: {evaluator: {id: "e1", slug: "exact-match"}}}, + ], + mappings: [{column: {kind: "annotation", name: "success"}, step: {key: "eval", path: "out"}}], +} + +function annotationRow(success: boolean): HydratedScenarioRow { + return { + scenario: {id: "s1", status: "success"}, + results: [], + // resolveFromMetric only reads `m.data` — a minimal metric is enough. + metrics: [{data: {eval: {out: success}}}] as unknown as HydratedScenarioRow["metrics"], + testcase: null, + traces: {}, + } +} + +describe("matchesRowFilter", () => { + it("resolves the run schema then evaluates the filter", () => { + const filter: PredicateGroup = { + op: "and", + conditions: [{groupKind: "evaluator", columnName: "success", op: "eq", value: true}], + } + assert.equal(matchesRowFilter(filter, ANNOTATION_SCHEMA, annotationRow(true)), true) + assert.equal(matchesRowFilter(filter, ANNOTATION_SCHEMA, annotationRow(false)), false) + }) +}) + +// ============================================================================= +// makePredicateGroupFilter — pipeline transform +// ============================================================================= + +describe("makePredicateGroupFilter", () => { + it("keeps only rows the filter matches", async () => { + const transform = makePredicateGroupFilter({ + filter: { + op: "or", + conditions: [ + {groupKind: "evaluator", columnName: "success", op: "eq", value: true}, + ], + }, + schema: ANNOTATION_SCHEMA, + }) + const chunk: Chunk = { + items: [annotationRow(true), annotationRow(false), annotationRow(true)], + cursor: null, + } + const out = await transform(chunk) + assert.equal(out.items.length, 2) + }) +}) + +// 
============================================================================= +// predicateToEntitySlices — union across a group's conditions +// ============================================================================= + +const MIXED_SCHEMA: RunSchema = { + steps: [ + {key: "in", type: "input", references: {testset: {id: "t1", slug: "ts"}}}, + {key: "ev", type: "annotation", references: {evaluator: {id: "e1", slug: "em"}}}, + ], + mappings: [ + {column: {kind: "input", name: "question"}, step: {key: "in", path: "data.question"}}, + {column: {kind: "annotation", name: "success"}, step: {key: "ev", path: "out"}}, + ], +} + +describe("predicateToEntitySlices — group input", () => { + const testsetCond: RowPredicate = { + groupKind: "testset", + columnName: "question", + op: "eq", + value: "x", + } + const evaluatorCond: RowPredicate = { + groupKind: "evaluator", + columnName: "success", + op: "eq", + value: true, + } + + it("takes the union of every condition's slices", () => { + const group: PredicateGroup = {op: "and", conditions: [testsetCond, evaluatorCond]} + const {slices} = predicateToEntitySlices(MIXED_SCHEMA, group) + // testset → results + testcases; evaluator → results + metrics. 
+ assert.deepEqual([...slices].sort(), ["metrics", "results", "testcases"]) + }) + + it("the boolean operator does not change the slice set", () => { + const and = predicateToEntitySlices(MIXED_SCHEMA, { + op: "and", + conditions: [testsetCond, evaluatorCond], + }) + const or = predicateToEntitySlices(MIXED_SCHEMA, { + op: "or", + conditions: [testsetCond, evaluatorCond], + }) + assert.deepEqual([...and.slices].sort(), [...or.slices].sort()) + }) + + it("an empty group needs no slices", () => { + const {slices} = predicateToEntitySlices(MIXED_SCHEMA, {op: "and", conditions: []}) + assert.equal(slices.size, 0) + }) +}) diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/filterSchema.ts b/web/packages/agenta-entities/src/evaluationRun/etl/filterSchema.ts new file mode 100644 index 0000000000..273063033e --- /dev/null +++ b/web/packages/agenta-entities/src/evaluationRun/etl/filterSchema.ts @@ -0,0 +1,133 @@ +/** + * filterSchema — derive the set of *filterable fields* for an evaluation + * run from its schema (steps + mappings). + * + * The filter UI (Phase 2 / T4) needs to know, before any scenario data is + * loaded: which columns can be filtered, what each one's value type is, + * and which operators that type allows. This module produces exactly + * that, keyed the same way `RowPredicate` targets a column + * (groupKind + groupSlug + columnName), so a UI selection maps straight + * onto a predicate. + * + * # Value typing + * + * The run schema does not carry per-column value types — those live in + * evaluator output schemas / sampled values, which this module does not + * fetch. So typing is best-effort: + * + * - metrics columns → "number" (cost / duration / tokens / scores) + * - everything else → "unknown" + * + * "unknown" still gets a safe equality-oriented operator set. Callers that + * *can* determine a precise type (e.g. 
T4 wiring with access to evaluator + * output schemas, or by sampling resolved values) pass `resolveValueType` + * to refine it — that is the intended extension seam, not an edit here. + * + * @packageDocumentation + */ + +import {groupRunColumns, type ColumnGroup, type RunSchema} from "./resolveMappings" +import type {RowPredicate} from "./rowPredicateFilter" + +/** Value type of a filterable field — drives the operator set. */ +export type FilterValueType = "string" | "number" | "boolean" | "unknown" + +/** All comparison operators a `RowPredicate` supports. */ +export type FilterOperator = RowPredicate["op"] + +/** A single field the user can filter on. */ +export interface FilterableField { + /** Targeting triple — maps directly onto `RowPredicate`. */ + groupKind: ColumnGroup["kind"] + groupSlug: string | null + columnName: string + /** Display label for the field (the column name). */ + label: string + /** Display label for the owning group (nested-header style). */ + groupLabel: string + /** Best-effort value type — "unknown" when undeterminable from the schema. */ + valueType: FilterValueType + /** Operators valid for this field's type. */ + operators: FilterOperator[] +} + +export interface FilterSchema { + fields: FilterableField[] +} + +const OPERATORS_BY_TYPE: Record<FilterValueType, FilterOperator[]> = { + number: ["eq", "ne", "lt", "lte", "gt", "gte", "in", "nin"], + string: ["eq", "ne", "in", "nin"], + boolean: ["eq", "ne"], + // Undeterminable type — equality + membership are always safe; ordered + // comparisons are not, so they are withheld until the type is known. + unknown: ["eq", "ne", "in", "nin"], +} + +/** The operator set valid for a given value type. */ +export function operatorsForType(type: FilterValueType): FilterOperator[] { + return [...OPERATORS_BY_TYPE[type]] +} + +/** Schema-only default value type — metrics are numeric, the rest unknown. */ +function defaultValueType(kind: ColumnGroup["kind"]): FilterValueType { + return kind === "metrics" ? 
"number" : "unknown" +} + +export interface BuildFilterSchemaOptions { + /** + * Refine a field's value type. Return `undefined` to keep the + * schema-only default. This is the seam for type information that does + * not live in the run schema — evaluator output schemas, sampled + * resolved values, etc. + */ + resolveValueType?: (field: { + groupKind: ColumnGroup["kind"] + groupSlug: string | null + columnName: string + }) => FilterValueType | undefined +} + +/** + * Build the filterable-field schema for a run. Fields appear in the same + * group order the table renders columns (testset → application → + * evaluator → metrics → other). Duplicate (groupKind, groupSlug, + * columnName) triples are collapsed to one field. + */ +export function buildFilterSchema( + schema: RunSchema | null, + options: BuildFilterSchemaOptions = {}, +): FilterSchema { + if (!schema) return {fields: []} + + const groups = groupRunColumns(schema.steps, schema.mappings) + const fields: FilterableField[] = [] + const seen = new Set() + + for (const g of groups) { + for (const leaf of g.columns) { + const dedupKey = `${leaf.kind}::${leaf.groupSlug ?? ""}::${leaf.name}` + if (seen.has(dedupKey)) continue + seen.add(dedupKey) + + const hinted = options.resolveValueType?.({ + groupKind: leaf.kind, + groupSlug: leaf.groupSlug, + columnName: leaf.name, + }) + const valueType = hinted ?? 
defaultValueType(leaf.kind) + + fields.push({ + groupKind: leaf.kind, + groupSlug: leaf.groupSlug, + columnName: leaf.name, + label: leaf.name, + groupLabel: g.group.label, + valueType, + operators: operatorsForType(valueType), + }) + } + } + + return {fields} +} diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts index d1f5b6acfe..4bb71e0faf 100644 --- a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts +++ b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts @@ -94,13 +94,38 @@ export { // Per eval-filtering.md §D2: this is the v1 frontend transform over already- // loaded metric data. v2 server-side filter swaps the source's `filtering` // param and this transform becomes a no-op. +// +// Multi-predicate AND/OR composition (decision D8) — `PredicateGroup` plus +// the `evaluate*` / `matchesRowFilter` row-level entry points and the +// `makePredicateGroupFilter` pipeline transform. export { makeRowPredicateFilter, + makePredicateGroupFilter, unwrapStatsForCompare, + isPredicateGroup, + evaluateRowPredicate, + evaluatePredicateGroup, + evaluateRowFilter, + matchesRowFilter, type RowPredicate, + type PredicateGroup, + type RowFilter, type PredicateFilterOptions, + type PredicateGroupFilterOptions, } from "./rowPredicateFilter" +// filterSchema — derives the filterable fields (typed + type-matched +// operators) the Phase 2 filter UI offers. Decision D8 / eval-filtering D4. +export { + buildFilterSchema, + operatorsForType, + type FilterSchema, + type FilterableField, + type FilterValueType, + type FilterOperator, + type BuildFilterSchemaOptions, +} from "./filterSchema" + // Hit-ratio meter — v1→v2 escalation signal (reports the regime; doesn't // swap engines today). 
Per eval-filtering.md §D2 + §C3: tracks rolling // (matched/scanned) and recommends escalating to v2 when the ratio falls diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/predicateToEntitySlices.ts b/web/packages/agenta-entities/src/evaluationRun/etl/predicateToEntitySlices.ts index 51904c7c99..4a8bca4c6a 100644 --- a/web/packages/agenta-entities/src/evaluationRun/etl/predicateToEntitySlices.ts +++ b/web/packages/agenta-entities/src/evaluationRun/etl/predicateToEntitySlices.ts @@ -26,7 +26,8 @@ import type {ColumnGroup, RunMapping, RunSchema, RunStep} from "./resolveMappings" import {computeColumnGroup} from "./resolveMappings" -import type {RowPredicate} from "./rowPredicateFilter" +import type {PredicateGroup, RowPredicate} from "./rowPredicateFilter" +import {isPredicateGroup} from "./rowPredicateFilter" export type EntitySlice = "results" | "metrics" | "testcases" | "traces" @@ -132,18 +133,28 @@ function sliceForPredicate(schema: RunSchema, predicate: RowPredicate): EntitySl /** * Resolve the full set of slices needed across all active predicates. * + * Accepts a single predicate, a predicate array, or a `PredicateGroup` + * (flat AND/OR — decision D8). For a group the slice set is the **union** + * of every condition's slices: evaluating either an AND or an OR needs the + * data behind every condition, so the boolean operator does not change the + * fetch set. + * * Empty predicate set = no filter active = no predicate-driven fetch * required. Caller decides what to do (fetch all for display, or wait * for cells to materialize themselves). */ export function predicateToEntitySlices( schema: RunSchema | null, - predicates: RowPredicate | RowPredicate[] | null | undefined, + predicates: RowPredicate | RowPredicate[] | PredicateGroup | null | undefined, ): PredicateSliceResult { if (!schema || !predicates) { return {slices: new Set(), matchedColumns: [], fallbackToAll: false} } - const list = Array.isArray(predicates) ? 
predicates : [predicates] + const list: RowPredicate[] = Array.isArray(predicates) + ? predicates + : isPredicateGroup(predicates) + ? predicates.conditions + : [predicates] if (list.length === 0) { return {slices: new Set(), matchedColumns: [], fallbackToAll: false} } diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/rowPredicateFilter.ts b/web/packages/agenta-entities/src/evaluationRun/etl/rowPredicateFilter.ts index 8e8e712705..e36ddebed1 100644 --- a/web/packages/agenta-entities/src/evaluationRun/etl/rowPredicateFilter.ts +++ b/web/packages/agenta-entities/src/evaluationRun/etl/rowPredicateFilter.ts @@ -42,7 +42,12 @@ import type {Chunk, Transform} from "../../etl/core/types" import type {HydratedScenarioRow, HydratableScenario} from "./hydrateScenariosTransform" -import {resolveMappings, type ColumnGroup, type RunSchema} from "./resolveMappings" +import { + resolveMappings, + type ColumnGroup, + type ResolvedColumn, + type RunSchema, +} from "./resolveMappings" /** * One value-comparison clause against a single resolved column. @@ -189,3 +194,131 @@ export function makeRowPredicateFilter( return {...chunk, items: passing} } } + +// ============================================================================ +// Multi-predicate AND/OR composition (Phase 2 / T4 — decision D8) +// +// `RowPredicate` above is a single clause. A `PredicateGroup` joins several +// clauses with ONE boolean operator. v1 is intentionally FLAT — `conditions` +// are leaf predicates, not nested groups. That covers the real cases +// ("score > 0.8 AND exact_match == true", "country in ['US','CA']") without +// an arbitrary-tree filter UI. Nested groups can come later if needed. +// ============================================================================ + +/** + * A flat group of predicates joined by a single boolean operator. + * + * One nesting level only: `conditions` are leaf `RowPredicate`s. 
+ * + * An **empty** group (`conditions: []`) is treated as *no constraint* — + * every row passes, regardless of `op`. An empty filter shows all rows. + */ +export interface PredicateGroup { + op: "and" | "or" + conditions: RowPredicate[] +} + +/** A filter is either a single predicate or a flat AND/OR group. */ +export type RowFilter = RowPredicate | PredicateGroup + +/** Narrow a `RowFilter` to a `PredicateGroup`. */ +export function isPredicateGroup(filter: RowFilter): filter is PredicateGroup { + return Array.isArray((filter as PredicateGroup).conditions) +} + +/** Find the resolved column a predicate targets (by kind + optional slug + name). */ +function findTargetColumn( + cols: ResolvedColumn[], + predicate: RowPredicate, +): ResolvedColumn | undefined { + return cols.find((c) => { + if (c.group.kind !== predicate.groupKind) return false + if (predicate.groupSlug != null && c.group.slug !== predicate.groupSlug) return false + return c.name === predicate.columnName + }) +} + +/** + * Evaluate one predicate against a row's already-resolved columns. + * + * A column the predicate references but the schema doesn't surface, or one + * that resolved to no value, compares against `undefined` — same semantics + * as `makeRowPredicateFilter` (so `eq`/`lt`/… fail, `ne` passes). + * + * This is *pure*: it does not know about hydration state. "Keep a row + * visible until its slices are hydrated" is a wiring-layer concern — the + * caller decides that before calling this. + */ +export function evaluateRowPredicate(predicate: RowPredicate, cols: ResolvedColumn[]): boolean { + const target = findTargetColumn(cols, predicate) + const actual = unwrapStatsForCompare(target?.value) + return compare(actual, predicate.op, predicate.value) +} + +/** + * Evaluate a flat AND/OR group against a row's resolved columns. + * + * - `op: "and"` — every condition must match. + * - `op: "or"` — at least one condition must match. + * - empty `conditions` — no constraint, the row passes. 
+ */ +export function evaluatePredicateGroup(group: PredicateGroup, cols: ResolvedColumn[]): boolean { + if (group.conditions.length === 0) return true + return group.op === "and" + ? group.conditions.every((p) => evaluateRowPredicate(p, cols)) + : group.conditions.some((p) => evaluateRowPredicate(p, cols)) +} + +/** Evaluate any `RowFilter` (single predicate or AND/OR group) against resolved columns. */ +export function evaluateRowFilter(filter: RowFilter, cols: ResolvedColumn[]): boolean { + return isPredicateGroup(filter) + ? evaluatePredicateGroup(filter, cols) + : evaluateRowPredicate(filter, cols) +} + +/** + * Convenience: resolve a hydrated row's columns from the run schema, then + * evaluate the filter. This is the row-level entry point the scenario + * table's `filteredRows` derivation uses. + */ +export function matchesRowFilter( + filter: RowFilter, + schema: RunSchema, + row: HydratedScenarioRow, +): boolean { + return evaluateRowFilter(filter, resolveMappings(row, schema)) +} + +export interface PredicateGroupFilterOptions { + /** The active filter — a single predicate or a flat AND/OR group. */ + filter: RowFilter + /** Run schema (steps + mappings), used to resolve columns per row. */ + schema: RunSchema + /** Optional per-chunk telemetry callback. */ + onChunkFiltered?: (info: {chunk: number; scanned: number; matched: number}) => void +} + +/** + * Build a `Transform` that keeps only rows satisfying a `RowFilter` + * (single predicate or AND/OR group). The ETL-pipeline counterpart of the + * row-level `matchesRowFilter` — use this for headless / chunked runs. 
+ */ +export function makePredicateGroupFilter( + options: PredicateGroupFilterOptions, +): Transform<HydratedScenarioRow<HydratableScenario>, HydratedScenarioRow<HydratableScenario>> { + const {filter, schema} = options + let chunkIdx = 0 + + return async (chunk: Chunk<HydratedScenarioRow<HydratableScenario>>) => { + chunkIdx++ + const passing = chunk.items.filter((row) => + evaluateRowFilter(filter, resolveMappings(row, schema)), + ) + options.onChunkFiltered?.({ + chunk: chunkIdx, + scanned: chunk.items.length, + matched: passing.length, + }) + return {...chunk, items: passing} + } +} From 5d8cc7a6e9aaff962c29112faf469a9dbdc493f4 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 11:34:47 +0200 Subject: [PATCH 07/29] fix(oss): restore scenario table columns dropped by the ETL swap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two regressions from the Phase 1 ETL column swap — both because ETL columns derive from the run's raw mappings, not the curated backend column set: - `testcase_dedup_id` (any `*_dedup_id` column) was rendered. These are internal dedup keys, not user-facing — `groupRunColumns` now drops them, matching the backend-metadata column path. - the static invocation-metrics group (cost / duration / tokens) disappeared. `useEtlColumns` now skips metrics-kind groups and `Table.tsx` keeps the production metric group(s), rendered by the existing metric cell. 
--- .../src/components/EvalRunDetails/Table.tsx | 32 +++++++++++++++---- .../EvalRunDetails/etl/useEtlColumns.tsx | 11 ++++++- .../etl/__tests__/groupRunColumns.test.ts | 18 +++++++++++ .../src/evaluationRun/etl/resolveMappings.ts | 6 ++++ 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx index 1cd8f70e1b..bbf626f227 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -131,9 +131,24 @@ const EvalRunDetailsTable = ({ // Evict molecule caches written for the outgoing run on scope change. useScopeChangeEviction({projectId, runId}) + // Production metric-group ids. The scenario table's "Metrics" group is + // the static invocation metrics (cost / duration / tokens) — injected + // by the backend-metadata path, not run-mapping-derived — so it is kept + // as-is rather than replaced by ETL columns. + const metricGroupKeys = useMemo( + () => + new Set( + (columnResult?.groups ?? []) + .filter((g) => g.kind === "metric") + .map((g) => String(g.id)), + ), + [columnResult?.groups], + ) + // Final rendered column set: production meta columns (index / status, - // timestamp, action) and the column-visibility trigger are kept; the - // schema group columns are replaced by the ETL-derived ones. While the + // timestamp, action), the column-visibility trigger, and the static + // metric group(s) are kept; the testset / application / evaluator / + // other schema groups are replaced by the ETL-derived ones. While the // run schema is still loading, the production columns are used whole // (their skeleton groups cover the gap). 
const tableColumns = useMemo(() => { @@ -144,24 +159,27 @@ const EvalRunDetailsTable = ({ for (const col of src) { const children = (col as {children?: unknown[]}).children const isGroup = Array.isArray(children) && children.length > 0 - if (isGroup) { + const key = String((col as {key?: unknown}).key ?? "") + const isMetricGroup = isGroup && metricGroupKeys.has(key) + if (isGroup && !isMetricGroup) { if (!inserted) { out.push(...(etlColumns as typeof src)) inserted = true } - // drop the production schema group column + // drop the production schema group column (replaced by ETL) } else { + // keep: meta / visibility columns AND static metric groups out.push(col) } } if (!inserted) { - // No production group columns — insert ETL groups before the - // trailing column-visibility trigger. + // No replaceable production group columns — insert ETL groups + // before the trailing column-visibility trigger. const at = Math.max(out.length - 1, 0) out.splice(at, 0, ...(etlColumns as typeof src)) } return out - }, [previewColumns.columns, etlColumns, runSchema]) + }, [previewColumns.columns, etlColumns, runSchema, metricGroupKeys]) // Inject synthetic columns for comparison exports (do not render in UI) const exportColumns = useMemo(() => { diff --git a/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx b/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx index 34890f348a..9ab25483de 100644 --- a/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx @@ -53,7 +53,16 @@ export const useEtlColumns = ({ return useMemo>(() => { if (!schema || !projectId || !runId) return [] - const grouped = groupRunColumns(schema.steps, schema.mappings) + // "metrics"-kind columns are intentionally skipped here. 
The + // scenario table's "Metrics" group is the *static* invocation + // metrics (cost / duration / tokens) injected by the + // backend-metadata column path — not run-mapping-derived — so that + // group is kept on the production path in `Table.tsx` and rendered + // by the existing metric cell. Emitting an ETL metrics group too + // would duplicate it. + const grouped = groupRunColumns(schema.steps, schema.mappings).filter( + (g) => g.group.kind !== "metrics", + ) return grouped.map((g) => { const children = g.columns.map((leaf) => { diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts index aab45d27bd..6026360173 100644 --- a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts +++ b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts @@ -152,6 +152,24 @@ describe("groupRunColumns — edge cases", () => { assert.deepEqual(groupRunColumns([], []), []) }) + it("drops internal _dedup_id columns (regression)", () => { + const mappings: RunMapping[] = [ + ...MAPPINGS, + { + column: {kind: "input", name: "testcase_dedup_id"}, + step: {key: "input", path: "data.testcase_dedup_id"}, + }, + ] + const grouped = groupRunColumns(STEPS, mappings) + const names = grouped.flatMap((g) => g.columns.map((c) => c.name)) + assert.equal(names.includes("testcase_dedup_id"), false) + // The dedup column is excluded; every other mapped column is kept. 
+ assert.equal( + grouped.reduce((n, g) => n + g.columns.length, 0), + MAPPINGS.length, + ) + }) + it("disambiguates two evaluators emitting the same column name", () => { const steps: RunStep[] = [ ...STEPS, diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts b/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts index 1cf67f5d92..f2d9be9f86 100644 --- a/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts +++ b/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts @@ -672,6 +672,10 @@ export interface RunColumnGroup { * backend-metadata column path also surfaces — dropping them would * silently shrink the visible column set. * + * Internal dedup keys (column names containing `_dedup_id`, e.g. + * `testcase_dedup_id`) are **excluded** — they are not user-facing + * columns. The backend-metadata column path drops them too. + * * Group order: testset → application → evaluator(s) → metrics → other. * Within a kind, groups appear in the order their columns first appear in * the mapping list (matching `groupResolvedColumns`). @@ -686,6 +690,8 @@ export function groupRunColumns(steps: RunStep[], mappings: RunMapping[]): RunCo mappings.forEach((mapping, idx) => { const columnName = mapping.column?.name if (typeof columnName !== "string" || !columnName) return + // Internal dedup keys are not user-facing columns. + if (columnName.includes("_dedup_id")) return const step = mapping.step?.key ? (stepByKey.get(mapping.step.key) ?? null) : null const path = mapping.step?.path ?? 
"" const group = computeColumnGroup(step, path) From 33efa7eaf406c9bf2bc84b7973895e69c8c0f570 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 11:51:32 +0200 Subject: [PATCH 08/29] feat(oss): multi-predicate filtering in the eval scenarios table (T4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires Phase 2 filtering into the real scenarios table — multi-condition AND/OR (decision D8), not the /etl-poc page. - ScenarioFilterBar: a filter bar in the table header — column / operator / value per condition, AND/OR join, add / remove / clear. Columns and type-matched operators come from buildFilterSchema; a name heuristic refines value types for the common cases. - scenarioFilterState: per-run PredicateGroup atom; half-built conditions are dropped from the evaluated filter. - useScenarioFilter: filters the base rows via evaluateRowFilter against molecule-cache-resolved columns, with a viewport-fill loop so a strict filter still fills the viewport; unhydrated rows stay visible until known; the confirmed-match count gates the loop. - Table.tsx: feeds filtered base rows into the merge, drives predicate- aware hydration, remounts the table on filter change, guards cell renders against the transient undefined record. 
--- .../src/components/EvalRunDetails/Table.tsx | 58 +++- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 279 ++++++++++++++++++ .../EvalRunDetails/etl/scenarioFilterState.ts | 37 +++ .../EvalRunDetails/etl/useEtlColumns.tsx | 5 + .../EvalRunDetails/etl/useHydrateScenarios.ts | 9 +- .../EvalRunDetails/etl/useScenarioFilter.ts | 202 +++++++++++++ 6 files changed, 580 insertions(+), 10 deletions(-) create mode 100644 web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx create mode 100644 web/oss/src/components/EvalRunDetails/etl/scenarioFilterState.ts create mode 100644 web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx index bbf626f227..e4e925d3ad 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -27,9 +27,11 @@ import {DEFAULT_SCENARIO_PAGE_SIZE, evaluationRunQueryAtomFamily} from "./atoms/ import type {PreviewTableRow} from "./atoms/tableRows" import ScenarioColumnVisibilityPopoverContent from "./components/columnVisibility/ColumnVisibilityPopoverContent" import {CellMaterializerContext} from "./etl/cellMaterializerContext" +import ScenarioFilterBar from "./etl/ScenarioFilterBar" import {useCellMaterialization} from "./etl/useCellMaterialization" import {useEtlColumns} from "./etl/useEtlColumns" import {useHydrateScenarios} from "./etl/useHydrateScenarios" +import {useScenarioFilter} from "./etl/useScenarioFilter" import {useScopeChangeEviction} from "./etl/useScopeChangeEviction" import { evaluationPreviewDatasetStore, @@ -47,6 +49,10 @@ import {patchFocusDrawerQueryParams} from "./state/urlFocusDrawer" type TableRowData = PreviewTableRow +// Stable empty reference for the table's rows while a filter is still +// scanning with zero confirmed matches — avoids per-page flicker. 
+const EMPTY_MERGED_ROWS: TableRowData[] = [] + interface EvalRunDetailsTableProps { runId: string evaluationType: "auto" | "human" | "online" @@ -112,15 +118,29 @@ const EvalRunDetailsTable = ({ return {steps, mappings} }, [runQuery.data]) + // Phase 2 — multi-predicate filtering (D8). Filters the base run's + // rows; each comparison group follows its base row. + const {filteredBaseRows, effectiveFilter, isScanning} = useScenarioFilter({ + projectId, + runId, + schema: runSchema, + baseRows: basePagination.rows, + loadNextPage: basePagination.loadNextPage, + hasMore: basePagination.paginationInfo.hasMore, + isFetching: basePagination.paginationInfo.isFetching, + }) + const etlColumns = useEtlColumns({projectId, runId, schema: runSchema}) - // Page-level hydrate — predicate-aware (Phase 2). In Phase 1 (no - // predicate) this is inert; cells materialize their own visible data. + // Page-level hydrate — predicate-aware: with an active filter it + // fetches the entity slices the filter needs to be evaluated; with no + // filter it is inert and cells materialize their own visible data. useHydrateScenarios({ projectId, runId, rows: basePagination.rows, schema: runSchema, + predicate: effectiveFilter, sliceMode: "auto", }) @@ -219,7 +239,7 @@ const EvalRunDetailsTable = ({ const mergedRows = useMemo(() => { if (!compareSlots.some(Boolean)) { - return basePagination.rows.map((row) => ({ + return filteredBaseRows.map((row) => ({ ...row, baseScenarioId: row.scenarioId ?? row.id, compareIndex: 0, @@ -227,7 +247,7 @@ const EvalRunDetailsTable = ({ })) } - const baseRows = basePagination.rows.map((row) => ({ + const baseRows = filteredBaseRows.map((row) => ({ ...row, baseScenarioId: row.scenarioId ?? 
row.id, compareIndex: 0, @@ -309,7 +329,7 @@ const EvalRunDetailsTable = ({ }) return result - }, [basePagination.rows, compareSlots, compareRowsBySlot]) + }, [filteredBaseRows, compareSlots, compareRowsBySlot]) const handleRowClick = useCallback( (record: TableRowData) => { @@ -370,11 +390,14 @@ const EvalRunDetailsTable = ({ const paginationForShell = useMemo>( () => ({ - rows: mergedRows, + // While a filter is still scanning with zero confirmed + // matches, hand the table a stable empty array so it shows one + // steady loading state instead of per-page row flicker. + rows: isScanning ? EMPTY_MERGED_ROWS : mergedRows, loadNextPage: handleLoadMore, resetPages: handleResetPages, }), - [handleLoadMore, handleResetPages, mergedRows], + [handleLoadMore, handleResetPages, mergedRows, isScanning], ) // Build group map for export label resolution @@ -924,13 +947,27 @@ const EvalRunDetailsTable = ({ return (
+
+ /* + * Remount on filter change. Applying a filter + * shrinks the row set sharply; remounting resets + * the virtual render window to the new length so + * its row renderer can't index past the end. + * Column visibility survives (localStorage-backed). + */ + key={`scenario-table-${runId}-${JSON.stringify(effectiveFilter)}`} datasetStore={evaluationPreviewDatasetStore} tableScope={tableScope} store={store} columns={tableColumns} - rowKey={(record) => record.key} + /* + * Defensive rowKey — antd's virtual table can hand + * this an out-of-range `undefined` record for a + * frame while the filtered row set is shrinking. + */ + rowKey={(record, index) => record?.key ?? `__phantom_${index ?? 0}`} tableClassName={clsx( "agenta-scenario-table", `agenta-scenario-table--row-${rowHeight}`, @@ -965,6 +1002,11 @@ const EvalRunDetailsTable = ({ virtual: true, bordered: true, tableLayout: "fixed", + // One steady overlay while a filter scans for + // its first match — replaces per-page flicker. + loading: isScanning + ? {spinning: true, tip: "Scanning for matches…"} + : false, onRow: (record) => { const backgroundColor = hasCompareRuns ? getComparisonColor( diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx new file mode 100644 index 0000000000..65a45928e0 --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -0,0 +1,279 @@ +/** + * ScenarioFilterBar — multi-condition AND/OR filter for the evaluation + * run scenarios table (decision D8). + * + * Columns come from `buildFilterSchema` (run graph). Each condition is a + * (column, operator, value) clause; conditions are joined by a single + * AND/OR operator. The bar writes a `PredicateGroup` to the per-run + * filter atom — `useScenarioFilter` reads it and filters the rows. 
+ * + * Value typing is best-effort: the run schema does not carry column + * types, so a name heuristic refines `buildFilterSchema`'s default + * (numeric for metrics, unknown otherwise) for the common cases. + */ + +import {useCallback, useMemo} from "react" + +import { + buildFilterSchema, + type FilterOperator, + type FilterValueType, + type RowPredicate, + type RunSchema, +} from "@agenta/entities/evaluationRun/etl" +import {Button, Input, InputNumber, Segmented, Select, Tooltip} from "antd" +import {useAtom} from "jotai" +import {Plus, X} from "lucide-react" + +import {scenarioFilterAtomFamily, isScenarioFilterActive} from "./scenarioFilterState" + +const OP_LABELS: Record = { + eq: "equals", + ne: "not equals", + lt: "<", + lte: "≤", + gt: ">", + gte: "≥", + in: "in", + nin: "not in", +} + +// Operators offered in the UI. `in` / `nin` are supported by the filter +// engine but need an array-value input — deferred from this v1 bar. +const UI_OPERATORS: FilterOperator[] = ["eq", "ne", "lt", "lte", "gt", "gte"] + +const NUMERIC_HINTS = [ + "score", + "cost", + "latency", + "duration", + "token", + "count", + "rate", + "ratio", + "total", + "avg", + "mean", +] +const BOOLEAN_HINTS = ["success", "passed", "failed", "correct", "valid", "match", "is_", "has_"] + +/** Name-based value-type heuristic — refines the schema-only default. */ +function heuristicValueType(field: {columnName: string}): FilterValueType | undefined { + const n = field.columnName.toLowerCase() + if (NUMERIC_HINTS.some((h) => n.includes(h))) return "number" + if (BOOLEAN_HINTS.some((h) => n.includes(h))) return "boolean" + return undefined +} + +const encodeField = (f: {groupKind: string; groupSlug?: string | null; columnName: string}) => + `${f.groupKind}|${f.groupSlug ?? 
""}|${f.columnName}` + +const blankCondition = (): RowPredicate => ({ + groupKind: "evaluator", + groupSlug: null, + columnName: "", + op: "eq", + value: "", +}) + +export interface ScenarioFilterBarProps { + runId: string + schema: RunSchema | null +} + +const ScenarioFilterBar = ({runId, schema}: ScenarioFilterBarProps) => { + const [filter, setFilter] = useAtom(scenarioFilterAtomFamily(runId)) + + const fields = useMemo( + () => buildFilterSchema(schema, {resolveValueType: heuristicValueType}).fields, + [schema], + ) + const fieldByKey = useMemo(() => new Map(fields.map((f) => [encodeField(f), f])), [fields]) + const fieldOptions = useMemo( + () => + fields.map((f) => ({ + value: encodeField(f), + label: `${f.groupLabel} · ${f.label}`, + })), + [fields], + ) + + const conditions = filter.conditions + + const setConditions = useCallback( + (next: RowPredicate[]) => setFilter((prev) => ({...prev, conditions: next})), + [setFilter], + ) + const updateCondition = useCallback( + (index: number, partial: Partial) => + setConditions(conditions.map((c, i) => (i === index ? {...c, ...partial} : c))), + [conditions, setConditions], + ) + const removeCondition = useCallback( + (index: number) => setConditions(conditions.filter((_, i) => i !== index)), + [conditions, setConditions], + ) + + // Run graph carries no filterable columns — hide the bar entirely. + if (fields.length === 0) return null + + const active = isScenarioFilterActive(filter) + + return ( +
+ Filters + + {conditions.length >= 2 && ( + + size="small" + value={filter.op} + options={[ + {label: "AND", value: "and"}, + {label: "OR", value: "or"}, + ]} + onChange={(op) => setFilter((prev) => ({...prev, op}))} + /> + )} + + {conditions.map((condition, index) => { + const fieldKey = condition.columnName ? encodeField(condition) : undefined + const field = fieldKey ? fieldByKey.get(fieldKey) : undefined + const valueType: FilterValueType = field?.valueType ?? "unknown" + const ops = field + ? UI_OPERATORS.filter((o) => field.operators.includes(o)) + : UI_OPERATORS + + return ( +
+ {index > 0 && ( + {filter.op} + )} + + size="small" + placeholder="Column" + style={{minWidth: 200}} + showSearch + optionFilterProp="label" + value={fieldKey} + options={fieldOptions} + onChange={(value) => { + const picked = fieldByKey.get(value) + if (!picked) return + const nextOps = UI_OPERATORS.filter((o) => + picked.operators.includes(o), + ) + updateCondition(index, { + groupKind: picked.groupKind, + groupSlug: picked.groupSlug, + columnName: picked.columnName, + op: nextOps[0] ?? "eq", + value: picked.valueType === "boolean" ? true : "", + }) + }} + /> + + size="small" + style={{minWidth: 104}} + value={condition.op} + disabled={!field} + options={ops.map((o) => ({value: o, label: OP_LABELS[o]}))} + onChange={(op) => updateCondition(index, {op})} + /> + updateCondition(index, {value})} + /> + +
+ ) + })} + + + + {active && ( + + )} +
+ ) +} + +/** Value input — shape depends on the field's (best-effort) value type. */ +const ConditionValueInput = ({ + valueType, + value, + disabled, + onChange, +}: { + valueType: FilterValueType + value: unknown + disabled: boolean + onChange: (value: unknown) => void +}) => { + if (valueType === "boolean") { + // antd Select option values must be string|number — encode the + // boolean as a string and decode on change. + return ( + + size="small" + style={{minWidth: 96}} + placeholder="Value" + disabled={disabled} + value={value === true ? "true" : value === false ? "false" : undefined} + options={[ + {label: "true", value: "true"}, + {label: "false", value: "false"}, + ]} + onChange={(v) => onChange(v === "true")} + /> + ) + } + if (valueType === "number") { + return ( + onChange(v ?? "")} + /> + ) + } + return ( + onChange(e.target.value)} + /> + ) +} + +export default ScenarioFilterBar diff --git a/web/oss/src/components/EvalRunDetails/etl/scenarioFilterState.ts b/web/oss/src/components/EvalRunDetails/etl/scenarioFilterState.ts new file mode 100644 index 0000000000..5b5203f05f --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/scenarioFilterState.ts @@ -0,0 +1,37 @@ +/** + * Scenario table filter state — the active multi-predicate filter + * (decision D8: flat AND/OR), one per run. + * + * The atom holds the *raw* filter the filter bar edits — it may contain + * half-built conditions (a column picked but no value yet). Evaluation + * uses `toEffectiveFilter`, which drops the incomplete ones, so a + * partially-typed condition never filters every row out. + */ + +import type {PredicateGroup, RowPredicate} from "@agenta/entities/evaluationRun/etl" +import {atom} from "jotai" +import {atomFamily} from "jotai/utils" + +const EMPTY_FILTER: PredicateGroup = {op: "and", conditions: []} + +/** Per-run active scenario filter (raw — may contain half-built conditions). 
*/ +export const scenarioFilterAtomFamily = atomFamily((_runId: string) => + atom(EMPTY_FILTER), +) + +/** A condition is complete once it has a column and a defined, non-empty value. */ +export const isConditionComplete = (c: RowPredicate): boolean => + !!c.columnName && c.value !== undefined && c.value !== "" + +/** + * The filter actually evaluated — half-built conditions dropped. Returns + * the same `op` with only complete conditions. + */ +export const toEffectiveFilter = (group: PredicateGroup): PredicateGroup => ({ + op: group.op, + conditions: group.conditions.filter(isConditionComplete), +}) + +/** True when at least one complete condition is set. */ +export const isScenarioFilterActive = (group: PredicateGroup): boolean => + group.conditions.some(isConditionComplete) diff --git a/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx b/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx index 9ab25483de..ecaf63bfcc 100644 --- a/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx @@ -82,6 +82,11 @@ export const useEtlColumns = ({ ellipsis: true, align: "left" as const, render: (_: unknown, record: PreviewTableRow) => { + // antd's virtual table can briefly call a cell + // render with an out-of-range `undefined` record + // while the (filtered) dataSource is shrinking — + // render nothing for those phantom rows. + if (record == null) return null // Skeleton / not-yet-keyed rows (incl. comparison // placeholders) render a fixed-height placeholder. 
if (record.__isSkeleton || !record.scenarioId) { diff --git a/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts b/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts index b2e07a5dc6..ac8f1dbd02 100644 --- a/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts +++ b/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts @@ -31,6 +31,7 @@ import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entiti import { predicateToEntitySlices, type EntitySlice, + type PredicateGroup, type RowPredicate, type RunSchema, } from "@agenta/entities/evaluationRun/etl" @@ -98,8 +99,12 @@ export interface UseHydrateScenariosArgs { rows: readonly HydratableRowRef[] /** Run schema — maps an active predicate's column to entity slices. */ schema?: RunSchema | null - /** Active predicate(s) — Phase 2 filtering. Null in Phase 1. */ - predicate?: RowPredicate | RowPredicate[] | null + /** + * Active filter — a single predicate, a predicate array, or a flat + * AND/OR `PredicateGroup` (Phase 2). When present, page-level hydrate + * fetches the entity slices the filter needs so it can be evaluated. + */ + predicate?: RowPredicate | RowPredicate[] | PredicateGroup | null /** Hydrate strategy — see `SliceFetchMode`. Default "auto". */ sliceMode?: SliceFetchMode } diff --git a/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts b/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts new file mode 100644 index 0000000000..bc81f303ea --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts @@ -0,0 +1,202 @@ +/** + * useScenarioFilter — applies the active multi-predicate filter (D8) to + * the scenario rows. + * + * A row is kept when its resolved columns satisfy the filter. 
Rows whose + * slices are not hydrated yet are kept visible ("keep-visible-until-known") + * so a real match is never hidden while it loads; the viewport-fill loop + * and the confirmed-match count gate on *confirmed* matches only. + * + * Because a strict filter can reduce the visible row count below the + * viewport height, the IVT's scroll-triggered `loadMore` may never fire. + * While a filter is active this hook drives `loadNextPage` itself until + * enough confirmed matches accumulate or the dataset is exhausted. + */ + +import {useEffect, useMemo} from "react" + +import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun" +import { + evaluateRowFilter, + resolveMappings, + type HydratedScenarioRow, + type PredicateGroup, + type ResolvedColumn, + type RunSchema, +} from "@agenta/entities/evaluationRun/etl" +import {useQueryClient, type QueryClient} from "@tanstack/react-query" +import {useAtomValue} from "jotai" + +import { + scenarioFilterAtomFamily, + isScenarioFilterActive, + toEffectiveFilter, +} from "./scenarioFilterState" +import {hydrationVersionAtom} from "./useHydrateScenarios" + +/** Enough confirmed matches to fill a typical viewport before the loop stops. */ +const VIEWPORT_FILL_TARGET = 30 + +interface FilterableRow { + scenarioId?: unknown + __isSkeleton?: unknown +} + +/** + * Build a row's resolved columns from the molecule caches. Returns `null` + * when nothing is hydrated yet for the scenario (results + metrics both + * empty) — the caller treats that as "not known yet". + */ +function resolveScenarioColumnsFromCache( + queryClient: QueryClient, + projectId: string, + runId: string, + scenarioId: string, + schema: RunSchema, +): ResolvedColumn[] | null { + const results = (evaluationResultMolecule.get.byScenario({projectId, runId, scenarioId}) ?? + []) as HydratedScenarioRow["results"] + const metrics = (evaluationMetricMolecule.get.byScenario({projectId, runId, scenarioId}) ?? 
+ []) as HydratedScenarioRow["metrics"] + if (results.length === 0 && metrics.length === 0) return null + + const testcaseId = + results.find((r) => typeof r.testcase_id === "string" && r.testcase_id)?.testcase_id ?? null + const testcase = testcaseId + ? (queryClient.getQueryData([ + "testcase", + projectId, + testcaseId, + ]) ?? null) + : null + + const traces: Record = {} + for (const r of results) { + if (typeof r.trace_id === "string" && r.trace_id) { + const cached = queryClient.getQueryData([ + "trace-entity", + projectId, + r.trace_id, + ]) + if (cached != null) traces[r.trace_id] = cached + } + } + + return resolveMappings( + { + scenario: {id: scenarioId, status: "success"}, + results, + metrics, + testcase, + traces, + }, + {steps: schema.steps, mappings: schema.mappings}, + ) +} + +export interface UseScenarioFilterArgs { + projectId: string | null + runId: string | null + schema: RunSchema | null + /** The base (main-run) rows, pre-merge. */ + baseRows: readonly TRow[] + loadNextPage: () => void + hasMore: boolean + isFetching: boolean +} + +export interface UseScenarioFilterResult { + /** Raw filter (may contain half-built conditions) — for the filter bar. */ + rawFilter: PredicateGroup + /** Filter actually evaluated — half-built conditions dropped. */ + effectiveFilter: PredicateGroup + /** True when at least one complete condition is set. */ + active: boolean + /** Base rows after the filter — unfiltered when no filter is active. */ + filteredBaseRows: TRow[] + /** Rows confirmed (hydrated AND matching) to satisfy the filter. */ + confirmedMatchCount: number + /** True while the fill loop is still scanning with zero confirmed matches. */ + isScanning: boolean +} + +export function useScenarioFilter({ + projectId, + runId, + schema, + baseRows, + loadNextPage, + hasMore, + isFetching, +}: UseScenarioFilterArgs): UseScenarioFilterResult { + const queryClient = useQueryClient() + const rawFilter = useAtomValue(scenarioFilterAtomFamily(runId ?? 
"__none__")) + // Re-evaluate when the molecule caches change. + const hydrationVersion = useAtomValue(hydrationVersionAtom) + + const effectiveFilter = useMemo(() => toEffectiveFilter(rawFilter), [rawFilter]) + const active = isScenarioFilterActive(rawFilter) + + const filteredBaseRows = useMemo(() => { + if (!active || !schema || !projectId || !runId) return baseRows as TRow[] + return (baseRows as TRow[]).filter((r) => { + const scenarioId = typeof r.scenarioId === "string" ? r.scenarioId : null + // Skeleton / not-yet-keyed rows pass — they can't be evaluated. + if (r.__isSkeleton || !scenarioId) return true + const cols = resolveScenarioColumnsFromCache( + queryClient, + projectId, + runId, + scenarioId, + schema, + ) + // Not hydrated yet — keep visible until known. + if (!cols) return true + return evaluateRowFilter(effectiveFilter, cols) + }) + }, [baseRows, active, schema, projectId, runId, effectiveFilter, hydrationVersion, queryClient]) + + // Count of CONFIRMED matches — hydrated AND actually satisfying the + // filter. Excludes "keep-visible-until-known" rows, so it does not + // oscillate as pages hydrate. + const confirmedMatchCount = useMemo(() => { + if (!active || !schema || !projectId || !runId) return 0 + let n = 0 + for (const r of baseRows as TRow[]) { + const scenarioId = typeof r.scenarioId === "string" ? r.scenarioId : null + if (r.__isSkeleton || !scenarioId) continue + const cols = resolveScenarioColumnsFromCache( + queryClient, + projectId, + runId, + scenarioId, + schema, + ) + if (!cols) continue + if (evaluateRowFilter(effectiveFilter, cols)) n += 1 + } + return n + }, [baseRows, active, schema, projectId, runId, effectiveFilter, hydrationVersion, queryClient]) + + // Viewport-fill loop — a strict filter may keep the visible row count + // below the viewport, so IVT's scroll-triggered loadMore never fires. + // Drive it ourselves until enough confirmed matches accumulate or the + // dataset is exhausted. 
+ useEffect(() => { + if (!active) return + if (!hasMore || isFetching) return + if (confirmedMatchCount >= VIEWPORT_FILL_TARGET) return + loadNextPage() + }, [active, hasMore, isFetching, confirmedMatchCount, loadNextPage]) + + const isScanning = active && confirmedMatchCount === 0 && hasMore + + return { + rawFilter, + effectiveFilter, + active, + filteredBaseRows, + confirmedMatchCount, + isScanning, + } +} From 5249e9e2e4d631ff0020a5f0aafbf9701a444332 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 12:15:17 +0200 Subject: [PATCH 09/29] feat(oss): scope v1 scenario filtering to metric-related columns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only evaluator-output and metric columns are offered in the filter bar for v1 — testset (input) and application (output) columns are withheld behind a UI allowlist (FILTERABLE_COLUMN_KINDS). The filter engine and buildFilterSchema still support every column kind, so enabling the rest later is a one-line flip — no structural change. --- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index 65a45928e0..9ffb8c378e 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -16,6 +16,7 @@ import {useCallback, useMemo} from "react" import { buildFilterSchema, + type ColumnGroup, type FilterOperator, type FilterValueType, type RowPredicate, @@ -42,6 +43,23 @@ const OP_LABELS: Record = { // engine but need an array-value input — deferred from this v1 bar. const UI_OPERATORS: FilterOperator[] = ["eq", "ne", "lt", "lte", "gt", "gte"] +/** + * v1 column-kind allowlist for filtering. 
Only metric-related columns + * (evaluator outputs + metrics) are offered for now; testset (input) and + * application (output) columns are deliberately withheld. + * + * This is a UI allowlist only — the filter engine (`evaluateRowFilter`, + * `predicateToEntitySlices`) supports every kind. Flip a kind to `true` + * here to enable it; no other change is needed. + */ +const FILTERABLE_COLUMN_KINDS: Record = { + evaluator: true, + metrics: true, + testset: false, + application: false, + other: false, +} + const NUMERIC_HINTS = [ "score", "cost", @@ -85,7 +103,10 @@ const ScenarioFilterBar = ({runId, schema}: ScenarioFilterBarProps) => { const [filter, setFilter] = useAtom(scenarioFilterAtomFamily(runId)) const fields = useMemo( - () => buildFilterSchema(schema, {resolveValueType: heuristicValueType}).fields, + () => + buildFilterSchema(schema, {resolveValueType: heuristicValueType}).fields.filter( + (f) => FILTERABLE_COLUMN_KINDS[f.groupKind], + ), [schema], ) const fieldByKey = useMemo(() => new Map(fields.map((f) => [encodeField(f), f])), [fields]) From 4efc57a8690628ed20121608449308c7e319434c Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 12:25:15 +0200 Subject: [PATCH 10/29] fix(oss): type scenario filter columns from the evaluator output schema MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The filter bar typed columns with a name heuristic, so a boolean evaluator output (e.g. an LLM-judge field) was offered numeric comparators and a numeric value input. Column value types now come from the evaluator output schema: the columnResult column `metricType` — itself read from each output property's JSON-schema `type` by `extractMetrics` — maps to the filter value type via `buildColumnValueTypeResolver`. A boolean column now gets only equals / not-equals and a true/false input; a numeric one gets the comparators. The name heuristic is removed. 
--- .../src/components/EvalRunDetails/Table.tsx | 15 +++- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 45 ++++------- .../EvalRunDetails/etl/columnValueTypes.ts | 76 +++++++++++++++++++ 3 files changed, 104 insertions(+), 32 deletions(-) create mode 100644 web/oss/src/components/EvalRunDetails/etl/columnValueTypes.ts diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx index e4e925d3ad..ae9467449b 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -27,6 +27,7 @@ import {DEFAULT_SCENARIO_PAGE_SIZE, evaluationRunQueryAtomFamily} from "./atoms/ import type {PreviewTableRow} from "./atoms/tableRows" import ScenarioColumnVisibilityPopoverContent from "./components/columnVisibility/ColumnVisibilityPopoverContent" import {CellMaterializerContext} from "./etl/cellMaterializerContext" +import {buildColumnValueTypeResolver} from "./etl/columnValueTypes" import ScenarioFilterBar from "./etl/ScenarioFilterBar" import {useCellMaterialization} from "./etl/useCellMaterialization" import {useEtlColumns} from "./etl/useEtlColumns" @@ -100,6 +101,14 @@ const EvalRunDetailsTable = ({ const previewColumns = usePreviewColumns({columnResult, evaluationType}) + // Filter column value types — sourced from the evaluator output + // schemas (column `metricType`), so the filter bar offers the right + // operators + value input per column. + const filterValueTypeResolver = useMemo( + () => buildColumnValueTypeResolver(columnResult), + [columnResult], + ) + // ── ETL schema columns + self-hydrating cells (Phase 1 — T2 + T3) ── // The schema columns (testset / application / evaluator / metrics / // other) are derived from the run graph and rendered by cells that @@ -947,7 +956,11 @@ const EvalRunDetailsTable = ({ return (
- +
/* diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index 9ffb8c378e..5ca16c8f98 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -7,9 +7,11 @@ * AND/OR operator. The bar writes a `PredicateGroup` to the per-run * filter atom — `useScenarioFilter` reads it and filters the rows. * - * Value typing is best-effort: the run schema does not carry column - * types, so a name heuristic refines `buildFilterSchema`'s default - * (numeric for metrics, unknown otherwise) for the common cases. + * Column value types come from the evaluator output schema via the + * `resolveValueType` prop (see `columnValueTypes`). That drives the + * operator set and the value input — a boolean output gets only + * equality operators + a true/false input, a numeric one gets the + * comparators. */ import {useCallback, useMemo} from "react" @@ -60,29 +62,6 @@ const FILTERABLE_COLUMN_KINDS: Record = { other: false, } -const NUMERIC_HINTS = [ - "score", - "cost", - "latency", - "duration", - "token", - "count", - "rate", - "ratio", - "total", - "avg", - "mean", -] -const BOOLEAN_HINTS = ["success", "passed", "failed", "correct", "valid", "match", "is_", "has_"] - -/** Name-based value-type heuristic — refines the schema-only default. */ -function heuristicValueType(field: {columnName: string}): FilterValueType | undefined { - const n = field.columnName.toLowerCase() - if (NUMERIC_HINTS.some((h) => n.includes(h))) return "number" - if (BOOLEAN_HINTS.some((h) => n.includes(h))) return "boolean" - return undefined -} - const encodeField = (f: {groupKind: string; groupSlug?: string | null; columnName: string}) => `${f.groupKind}|${f.groupSlug ?? 
""}|${f.columnName}` @@ -97,17 +76,23 @@ const blankCondition = (): RowPredicate => ({ export interface ScenarioFilterBarProps { runId: string schema: RunSchema | null + /** Column value-type resolver, sourced from the evaluator output schema. */ + resolveValueType: (field: { + groupKind: string + groupSlug: string | null + columnName: string + }) => FilterValueType | undefined } -const ScenarioFilterBar = ({runId, schema}: ScenarioFilterBarProps) => { +const ScenarioFilterBar = ({runId, schema, resolveValueType}: ScenarioFilterBarProps) => { const [filter, setFilter] = useAtom(scenarioFilterAtomFamily(runId)) const fields = useMemo( () => - buildFilterSchema(schema, {resolveValueType: heuristicValueType}).fields.filter( + buildFilterSchema(schema, {resolveValueType}).fields.filter( (f) => FILTERABLE_COLUMN_KINDS[f.groupKind], ), - [schema], + [schema, resolveValueType], ) const fieldByKey = useMemo(() => new Map(fields.map((f) => [encodeField(f), f])), [fields]) const fieldOptions = useMemo( @@ -173,7 +158,6 @@ const ScenarioFilterBar = ({runId, schema}: ScenarioFilterBarProps) => { {filter.op} )} - size="small" placeholder="Column" style={{minWidth: 200}} showSearch @@ -196,7 +180,6 @@ const ScenarioFilterBar = ({runId, schema}: ScenarioFilterBarProps) => { }} /> - size="small" style={{minWidth: 104}} value={condition.op} disabled={!field} diff --git a/web/oss/src/components/EvalRunDetails/etl/columnValueTypes.ts b/web/oss/src/components/EvalRunDetails/etl/columnValueTypes.ts new file mode 100644 index 0000000000..6195b9c8d1 --- /dev/null +++ b/web/oss/src/components/EvalRunDetails/etl/columnValueTypes.ts @@ -0,0 +1,76 @@ +/** + * columnValueTypes — resolves a filterable column's value type from the + * evaluator output schema. + * + * The run graph does not carry column value types. 
The authoritative + * source is the evaluator's JSON output schema: `extractMetrics` + * (entities) reads each output property's `schema.type` into + * `MetricColumnDefinition.metricType`, and the backend-metadata column + * builder copies that onto every annotation column as + * `EvaluationTableColumn.metricType`. + * + * This module turns that `metricType` into the `FilterValueType` the + * filter bar uses, so a boolean evaluator output (e.g. an LLM-judge + * `success` field) is offered only equality operators and a true/false + * input — never the numeric comparators. + */ + +import type {FilterValueType} from "@agenta/entities/evaluationRun/etl" + +import type {EvaluationTableColumnsResult} from "../atoms/table" + +/** Map a JSON-schema-derived `metricType` to a filter value type. */ +function metricTypeToValueType(metricType: string | undefined): FilterValueType | undefined { + if (!metricType) return undefined + switch (metricType.toLowerCase()) { + case "boolean": + case "bool": + return "boolean" + case "number": + case "integer": + case "float": + return "number" + case "string": + return "string" + default: + // array / object / anything else — no safe operator set. + return "unknown" + } +} + +export interface ColumnValueTypeField { + groupKind: string + groupSlug: string | null + columnName: string +} + +/** + * Build a `resolveValueType` callback for `buildFilterSchema`, sourced + * from the evaluator output schemas (via `columnResult` column + * `metricType`). Returns `undefined` for a column with no known type so + * `buildFilterSchema` falls back to its schema-only default. + */ +export function buildColumnValueTypeResolver( + columnResult: EvaluationTableColumnsResult | undefined, +): (field: ColumnValueTypeField) => FilterValueType | undefined { + // Keyed by `::` (disambiguates two + // evaluators with same-named outputs) and by column name alone. 
+ const bySlugName = new Map() + const byName = new Map() + + for (const col of columnResult?.columns ?? []) { + const metricType = col.metricType + const name = col.label + if (!metricType || typeof name !== "string" || !name) continue + byName.set(name, metricType) + if (col.evaluatorSlug) bySlugName.set(`${col.evaluatorSlug}::${name}`, metricType) + } + + return (field) => { + const metricType = + (field.groupSlug + ? bySlugName.get(`${field.groupSlug}::${field.columnName}`) + : undefined) ?? byName.get(field.columnName) + return metricTypeToValueType(metricType) + } +} From 67ac220bfe4ae8bced6c790430c829152d0db2d2 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 12:33:04 +0200 Subject: [PATCH 11/29] fix(entities): resolve nullable evaluator output types in extractMetrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit extractMetrics typed an output property only when `schema.type` was a plain string, so a nullable field (`type: ["boolean", "null"]` or `anyOf: [{type: "boolean"}, {type: "null"}]`) fell back to "string" — which made the scenario filter bar offer a boolean field a text input instead of true/false. resolveSchemaType (new, dependency-free module) unwraps array and anyOf/oneOf nullable encodings to the first non-"null" type. 
--- .../core/__tests__/schemaType.test.ts | 42 +++++++++++++++++++ .../src/workflow/core/evaluatorResolution.ts | 3 +- .../src/workflow/core/schemaType.ts | 39 +++++++++++++++++ 3 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 web/packages/agenta-entities/src/workflow/core/__tests__/schemaType.test.ts create mode 100644 web/packages/agenta-entities/src/workflow/core/schemaType.ts diff --git a/web/packages/agenta-entities/src/workflow/core/__tests__/schemaType.test.ts b/web/packages/agenta-entities/src/workflow/core/__tests__/schemaType.test.ts new file mode 100644 index 0000000000..426a3cb856 --- /dev/null +++ b/web/packages/agenta-entities/src/workflow/core/__tests__/schemaType.test.ts @@ -0,0 +1,42 @@ +/** + * resolveSchemaType — regression guard for nullable evaluator output + * types. + * + * An evaluator output property declared nullable (`type: ["boolean", + * "null"]` or `anyOf: [{type: "boolean"}, {type: "null"}]`) must still + * resolve to its primitive type — otherwise the scenario filter bar + * mistypes a boolean field and offers numeric operators. 
+ */ + +import assert from "node:assert/strict" +import {describe, it} from "node:test" + +import {resolveSchemaType} from "../schemaType" + +describe("resolveSchemaType", () => { + it("returns a plain string type", () => { + assert.equal(resolveSchemaType({type: "boolean"}), "boolean") + assert.equal(resolveSchemaType({type: "number"}), "number") + }) + + it("treats a bare 'null' type as no type", () => { + assert.equal(resolveSchemaType({type: "null"}), undefined) + }) + + it("unwraps a nullable array type — first non-null entry", () => { + assert.equal(resolveSchemaType({type: ["boolean", "null"]}), "boolean") + assert.equal(resolveSchemaType({type: ["null", "number"]}), "number") + }) + + it("unwraps a nullable anyOf / oneOf union", () => { + assert.equal(resolveSchemaType({anyOf: [{type: "boolean"}, {type: "null"}]}), "boolean") + assert.equal(resolveSchemaType({oneOf: [{type: "null"}, {type: "string"}]}), "string") + }) + + it("returns undefined when no type is resolvable", () => { + assert.equal(resolveSchemaType({}), undefined) + assert.equal(resolveSchemaType(null), undefined) + assert.equal(resolveSchemaType(undefined), undefined) + assert.equal(resolveSchemaType({anyOf: [{type: "null"}]}), undefined) + }) +}) diff --git a/web/packages/agenta-entities/src/workflow/core/evaluatorResolution.ts b/web/packages/agenta-entities/src/workflow/core/evaluatorResolution.ts index 99474b7c7c..76c44cb3dd 100644 --- a/web/packages/agenta-entities/src/workflow/core/evaluatorResolution.ts +++ b/web/packages/agenta-entities/src/workflow/core/evaluatorResolution.ts @@ -14,6 +14,7 @@ import {resolveSchemaRef} from "../../runnable/portHelpers" import {resolveOutputSchema, resolveOutputSchemaProperties} from "./schema" +import {resolveSchemaType} from "./schemaType" // ============================================================================ // TYPES @@ -135,7 +136,7 @@ export const extractMetrics = (evaluator: { kind: "metric" as const, path: key, stepKey: evaluator.slug 
|| evaluator.id || "metric", - metricType: typeof schema?.type === "string" ? schema.type : METRIC_TYPE_FALLBACK, + metricType: resolveSchemaType(schema) ?? METRIC_TYPE_FALLBACK, displayLabel: typeof schema?.title === "string" ? schema.title : titleize(key), description: typeof schema?.description === "string" ? schema.description : undefined, } diff --git a/web/packages/agenta-entities/src/workflow/core/schemaType.ts b/web/packages/agenta-entities/src/workflow/core/schemaType.ts new file mode 100644 index 0000000000..b50e37204d --- /dev/null +++ b/web/packages/agenta-entities/src/workflow/core/schemaType.ts @@ -0,0 +1,39 @@ +/** + * resolveSchemaType — resolve a JSON-schema node's primitive type. + * + * Tolerates the nullable encodings an evaluator output schema may use: + * - `type: "boolean"` — plain + * - `type: ["boolean", "null"]` — array / nullable + * - `anyOf | oneOf: [{type: "boolean"}, {type: "null"}]` — union / nullable + * + * Returns the first non-`"null"` type found, or `undefined` when none is + * resolvable. + * + * Kept in its own dependency-free module so it is unit-testable without + * pulling in the evaluator-resolution import graph. + * + * @packageDocumentation + */ + +export const resolveSchemaType = ( + schema: Record | null | undefined, +): string | undefined => { + if (!schema || typeof schema !== "object") return undefined + + const type = schema.type + if (typeof type === "string") return type === "null" ? 
undefined : type + if (Array.isArray(type)) { + const first = type.find((t) => typeof t === "string" && t !== "null") + if (typeof first === "string") return first + } + + for (const key of ["anyOf", "oneOf"] as const) { + const branches = schema[key] + if (!Array.isArray(branches)) continue + for (const branch of branches) { + const resolved = resolveSchemaType(branch as Record) + if (resolved) return resolved + } + } + return undefined +} From e9abd359086e419ec895b5f14e31cdf8135712be Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 12:38:59 +0200 Subject: [PATCH 12/29] chore(oss): log evaluator schema on scenario filter column select MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Temporary diagnostic — on column select, logs the picked column's resolved value type plus the backend column metadata (per-column metricType and the evaluator definitions, including each evaluator's raw output schema) so a mistyped boolean field can be investigated. Remove once the type-resolution issue is resolved. --- .../src/components/EvalRunDetails/Table.tsx | 1 + .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 40 +++++++++++++++++-- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx index ae9467449b..d36d180877 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -960,6 +960,7 @@ const EvalRunDetailsTable = ({ runId={runId} schema={runSchema} resolveValueType={filterValueTypeResolver} + columnResult={columnResult} />
diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index 5ca16c8f98..6ed010d61e 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -28,6 +28,8 @@ import {Button, Input, InputNumber, Segmented, Select, Tooltip} from "antd" import {useAtom} from "jotai" import {Plus, X} from "lucide-react" +import type {EvaluationTableColumnsResult} from "../atoms/table" + import {scenarioFilterAtomFamily, isScenarioFilterActive} from "./scenarioFilterState" const OP_LABELS: Record = { @@ -82,9 +84,16 @@ export interface ScenarioFilterBarProps { groupSlug: string | null columnName: string }) => FilterValueType | undefined + /** Backend column metadata — used only for the column-select debug log. */ + columnResult?: EvaluationTableColumnsResult } -const ScenarioFilterBar = ({runId, schema, resolveValueType}: ScenarioFilterBarProps) => { +const ScenarioFilterBar = ({ + runId, + schema, + resolveValueType, + columnResult, +}: ScenarioFilterBarProps) => { const [filter, setFilter] = useAtom(scenarioFilterAtomFamily(runId)) const fields = useMemo( @@ -167,6 +176,32 @@ const ScenarioFilterBar = ({runId, schema, resolveValueType}: ScenarioFilterBarP onChange={(value) => { const picked = fieldByKey.get(value) if (!picked) return + // [debug] surface the schema info for the + // picked column so a mistyped field (e.g. + // a boolean shown with a text input) can + // be diagnosed. Remove once resolved. + + console.info("[ScenarioFilter][debug] column selected", { + field: { + groupKind: picked.groupKind, + groupSlug: picked.groupSlug, + columnName: picked.columnName, + }, + resolvedValueType: picked.valueType, + resolvedOperators: picked.operators, + columnResultMatches: (columnResult?.columns ?? 
[]) + .filter((c) => c.label === picked.columnName) + .map((c) => ({ + label: c.label, + metricType: c.metricType, + kind: c.kind, + stepType: c.stepType, + evaluatorSlug: c.evaluatorSlug, + evaluatorId: c.evaluatorId, + metricKey: c.metricKey, + })), + evaluators: columnResult?.evaluators, + }) const nextOps = UI_OPERATORS.filter((o) => picked.operators.includes(o), ) @@ -243,7 +278,6 @@ const ConditionValueInput = ({ // boolean as a string and decode on change. return ( - size="small" style={{minWidth: 96}} placeholder="Value" disabled={disabled} @@ -259,7 +293,6 @@ const ConditionValueInput = ({ if (valueType === "number") { return ( Date: Fri, 22 May 2026 12:52:47 +0200 Subject: [PATCH 13/29] fix(oss): match evaluator metric definitions by the bare output key MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The scenario column builder matched an evaluator's metric definition only by the canonical metric key (e.g. "attributes.ag.data.outputs.score"). But `extractMetrics` keys metrics by the output-schema property name — the bare key ("score") — so the match missed and `metricType` fell back to "string". A boolean evaluator output (e.g. an LLM-judge `score`) was therefore offered text / numeric filter operators instead of a true/false input. The lookup now also matches on the bare `valueKey`. Also removes the temporary column-select diagnostic log. 
--- .../src/components/EvalRunDetails/Table.tsx | 1 - .../EvalRunDetails/atoms/table/columns.ts | 12 +++++- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 37 +------------------ 3 files changed, 12 insertions(+), 38 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx index d36d180877..ae9467449b 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -960,7 +960,6 @@ const EvalRunDetailsTable = ({ runId={runId} schema={runSchema} resolveValueType={filterValueTypeResolver} - columnResult={columnResult} />
diff --git a/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts b/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts index 1607367e65..57f7c82ab7 100644 --- a/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts +++ b/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts @@ -490,9 +490,19 @@ const tableColumnsBaseAtomFamily = atomFamily((runId: string | null) => } const evaluator = column.evaluatorId ? evaluatorById.get(column.evaluatorId) : undefined + // Match the evaluator's metric definition by the canonical + // metric key (e.g. "attributes.ag.data.outputs.score") OR the + // bare value key (e.g. "score"). `extractMetrics` keys metrics + // by the output-schema property name — the bare key — so a + // canonical-key-only match misses and `metricType` falls back + // to "string", mis-typing the column (e.g. a boolean output). const metricKey = column.metricKey || column.valueKey const metricDefinition = evaluator?.metrics.find( - (metric) => metric.name === metricKey || metric.path === metricKey, + (metric) => + metric.name === metricKey || + metric.path === metricKey || + metric.name === column.valueKey || + metric.path === column.valueKey, ) const metricType = metricDefinition?.metricType || column.metricType || METRIC_TYPE_FALLBACK diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index 6ed010d61e..25b54a2f1f 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -28,8 +28,6 @@ import {Button, Input, InputNumber, Segmented, Select, Tooltip} from "antd" import {useAtom} from "jotai" import {Plus, X} from "lucide-react" -import type {EvaluationTableColumnsResult} from "../atoms/table" - import {scenarioFilterAtomFamily, isScenarioFilterActive} from "./scenarioFilterState" const OP_LABELS: Record = { @@ -84,16 +82,9 @@ export interface 
ScenarioFilterBarProps { groupSlug: string | null columnName: string }) => FilterValueType | undefined - /** Backend column metadata — used only for the column-select debug log. */ - columnResult?: EvaluationTableColumnsResult } -const ScenarioFilterBar = ({ - runId, - schema, - resolveValueType, - columnResult, -}: ScenarioFilterBarProps) => { +const ScenarioFilterBar = ({runId, schema, resolveValueType}: ScenarioFilterBarProps) => { const [filter, setFilter] = useAtom(scenarioFilterAtomFamily(runId)) const fields = useMemo( @@ -176,32 +167,6 @@ const ScenarioFilterBar = ({ onChange={(value) => { const picked = fieldByKey.get(value) if (!picked) return - // [debug] surface the schema info for the - // picked column so a mistyped field (e.g. - // a boolean shown with a text input) can - // be diagnosed. Remove once resolved. - - console.info("[ScenarioFilter][debug] column selected", { - field: { - groupKind: picked.groupKind, - groupSlug: picked.groupSlug, - columnName: picked.columnName, - }, - resolvedValueType: picked.valueType, - resolvedOperators: picked.operators, - columnResultMatches: (columnResult?.columns ?? []) - .filter((c) => c.label === picked.columnName) - .map((c) => ({ - label: c.label, - metricType: c.metricType, - kind: c.kind, - stepType: c.stepType, - evaluatorSlug: c.evaluatorSlug, - evaluatorId: c.evaluatorId, - metricKey: c.metricKey, - })), - evaluators: columnResult?.evaluators, - }) const nextOps = UI_OPERATORS.filter((o) => picked.operators.includes(o), ) From 2df8e4904c4d60db689944d3727556d612727fab Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 13:43:36 +0200 Subject: [PATCH 14/29] fix(oss): stop scenario filter flicker while gathering data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "scanning" state — which hands the table a stable empty placeholder plus a steady loading overlay — only gated on `hasMore`. 
So it dropped to "not scanning" in the gap between a page loading and its hydrate batch starting, flashing partially hydrated rows and then the empty state while the filter gathered data. isScanning now also gates on the page-hydrate `isHydrating` flag (matching the PoC's `scanningEmpty`), so the table holds one steady loading state until the scan settles. The flag is computed in Table.tsx where both the filter result and the hydration progress are available. --- web/oss/src/components/EvalRunDetails/Table.tsx | 14 ++++++++++++-- .../EvalRunDetails/etl/useScenarioFilter.ts | 5 ----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx index ae9467449b..0a427c10d3 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -129,7 +129,7 @@ const EvalRunDetailsTable = ({ // Phase 2 — multi-predicate filtering (D8). Filters the base run's // rows; each comparison group follows its base row. - const {filteredBaseRows, effectiveFilter, isScanning} = useScenarioFilter({ + const {filteredBaseRows, effectiveFilter, active, confirmedMatchCount} = useScenarioFilter({ projectId, runId, schema: runSchema, @@ -144,7 +144,7 @@ const EvalRunDetailsTable = ({ // Page-level hydrate — predicate-aware: with an active filter it // fetches the entity slices the filter needs to be evaluated; with no // filter it is inert and cells materialize their own visible data. - useHydrateScenarios({ + const hydration = useHydrateScenarios({ projectId, runId, rows: basePagination.rows, @@ -153,6 +153,16 @@ const EvalRunDetailsTable = ({ sliceMode: "auto", }) + // "Scanning" — a filter is active, nothing has confirmed-matched yet, + // and the pipeline is still working (pages to load OR a hydrate batch + // in flight). 
Gating on `isHydrating` too (not just `hasMore`) is what + // stops the table flickering between the empty state and partially + // hydrated rows while it gathers data. + const isScanning = + active && + confirmedMatchCount === 0 && + (basePagination.paginationInfo.hasMore || hydration.isHydrating) + // Cell-side lazy materializer — coalesces visible cells' slice // requests into one bulk fetch per (slice, run). const cellMaterializer = useCellMaterialization({projectId, runId}) diff --git a/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts b/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts index bc81f303ea..28f8cc974f 100644 --- a/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts +++ b/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts @@ -116,8 +116,6 @@ export interface UseScenarioFilterResult { filteredBaseRows: TRow[] /** Rows confirmed (hydrated AND matching) to satisfy the filter. */ confirmedMatchCount: number - /** True while the fill loop is still scanning with zero confirmed matches. */ - isScanning: boolean } export function useScenarioFilter({ @@ -189,14 +187,11 @@ export function useScenarioFilter({ loadNextPage() }, [active, hasMore, isFetching, confirmedMatchCount, loadNextPage]) - const isScanning = active && confirmedMatchCount === 0 && hasMore - return { rawFilter, effectiveFilter, active, filteredBaseRows, confirmedMatchCount, - isScanning, } } From 3cee5855817a34e1fcc2c84639361756673616bf Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 13:54:24 +0200 Subject: [PATCH 15/29] fix(oss): materialize filtered scenario rows incrementally, no flicker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The filtered row list kept unhydrated rows "visible until known", so it grew and shrank as hydration revealed non-matches — flickering the table between rows and the empty state until a page finished materializing. 
filteredBaseRows now holds confirmed matches only: a row appears once it is hydrated AND matches, so the list only ever grows during a scan. Rows materialize as their data arrives, with no show-then-drop. The full empty + loading overlay shows only until the first match lands; after that a "N matches · scanning…" indicator in the filter bar covers the rest of the scan. --- .../src/components/EvalRunDetails/Table.tsx | 33 +++++++------- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 24 +++++++++- .../EvalRunDetails/etl/useScenarioFilter.ts | 45 +++++++------------ 3 files changed, 53 insertions(+), 49 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx index 0a427c10d3..7a4923ac99 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -50,10 +50,6 @@ import {patchFocusDrawerQueryParams} from "./state/urlFocusDrawer" type TableRowData = PreviewTableRow -// Stable empty reference for the table's rows while a filter is still -// scanning with zero confirmed matches — avoids per-page flicker. -const EMPTY_MERGED_ROWS: TableRowData[] = [] - interface EvalRunDetailsTableProps { runId: string evaluationType: "auto" | "human" | "online" @@ -153,15 +149,14 @@ const EvalRunDetailsTable = ({ sliceMode: "auto", }) - // "Scanning" — a filter is active, nothing has confirmed-matched yet, - // and the pipeline is still working (pages to load OR a hydrate batch - // in flight). Gating on `isHydrating` too (not just `hasMore`) is what - // stops the table flickering between the empty state and partially - // hydrated rows while it gathers data. - const isScanning = - active && - confirmedMatchCount === 0 && - (basePagination.paginationInfo.hasMore || hydration.isHydrating) + // The filter scan is still running — more pages to load OR a hydrate + // batch in flight. 
+ const scanInProgress = + active && (basePagination.paginationInfo.hasMore || hydration.isHydrating) + // Nothing has confirmed-matched yet — show the full empty + loading + // overlay. Once the first match lands, rows show and grow (no overlay, + // no flicker — `filteredBaseRows` only ever grows during a scan). + const isScanning = scanInProgress && confirmedMatchCount === 0 // Cell-side lazy materializer — coalesces visible cells' slice // requests into one bulk fetch per (slice, run). @@ -409,14 +404,14 @@ const EvalRunDetailsTable = ({ const paginationForShell = useMemo>( () => ({ - // While a filter is still scanning with zero confirmed - // matches, hand the table a stable empty array so it shows one - // steady loading state instead of per-page row flicker. - rows: isScanning ? EMPTY_MERGED_ROWS : mergedRows, + // `mergedRows` is monotonic during a scan (confirmed matches + // only), so it can be handed to the table directly — no empty + // placeholder swap needed. + rows: mergedRows, loadNextPage: handleLoadMore, resetPages: handleResetPages, }), - [handleLoadMore, handleResetPages, mergedRows, isScanning], + [handleLoadMore, handleResetPages, mergedRows], ) // Build group map for export label resolution @@ -970,6 +965,8 @@ const EvalRunDetailsTable = ({ runId={runId} schema={runSchema} resolveValueType={filterValueTypeResolver} + scanning={scanInProgress} + matchCount={confirmedMatchCount} />
diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index 25b54a2f1f..f637d4c304 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -26,7 +26,7 @@ import { } from "@agenta/entities/evaluationRun/etl" import {Button, Input, InputNumber, Segmented, Select, Tooltip} from "antd" import {useAtom} from "jotai" -import {Plus, X} from "lucide-react" +import {Loader2, Plus, X} from "lucide-react" import {scenarioFilterAtomFamily, isScenarioFilterActive} from "./scenarioFilterState" @@ -82,9 +82,19 @@ export interface ScenarioFilterBarProps { groupSlug: string | null columnName: string }) => FilterValueType | undefined + /** True while the filter scan is still running. */ + scanning?: boolean + /** Confirmed matches found so far. */ + matchCount?: number } -const ScenarioFilterBar = ({runId, schema, resolveValueType}: ScenarioFilterBarProps) => { +const ScenarioFilterBar = ({ + runId, + schema, + resolveValueType, + scanning = false, + matchCount = 0, +}: ScenarioFilterBarProps) => { const [filter, setFilter] = useAtom(scenarioFilterAtomFamily(runId)) const fields = useMemo( @@ -213,6 +223,16 @@ const ScenarioFilterBar = ({runId, schema, resolveValueType}: ScenarioFilterBarP Add filter + {active && ( + + {scanning && } + + {matchCount} {matchCount === 1 ? "match" : "matches"} + {scanning ? " · scanning…" : ""} + + + )} + {active && (
- ) - })} + updateCondition(index, {value})} + /> + +
+ ) + })} +
- {active && ( + +
+ +
+ + +
+
+
+ ) + + return ( +
+ + + + + {appliedCount > 0 && ( {scanning && } @@ -232,16 +297,6 @@ const ScenarioFilterBar = ({ )} - - {active && ( - - )}
) } @@ -263,7 +318,8 @@ const ConditionValueInput = ({ // boolean as a string and decode on change. return ( - style={{minWidth: 96}} + size="small" + className="w-full" placeholder="Value" disabled={disabled} value={value === true ? "true" : value === false ? "false" : undefined} @@ -271,6 +327,7 @@ const ConditionValueInput = ({ {label: "true", value: "true"}, {label: "false", value: "false"}, ]} + getPopupContainer={getWithinPopover} onChange={(v) => onChange(v === "true")} /> ) @@ -278,7 +335,8 @@ const ConditionValueInput = ({ if (valueType === "number") { return ( Date: Fri, 22 May 2026 14:01:10 +0200 Subject: [PATCH 17/29] fix(oss): inline AND/OR connector in scenario filter rows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the top-level "Match All/Any" segmented control with a row-level connector: the first row reads "Where", every later row shows a borderless ghost-style And/Or select. The group has a single op (flat group, D8), so the connectors stay in sync — toggling any one sets the group op. --- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 41 ++++++++++--------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index 517a0df3e3..8aa8dfffb5 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -23,7 +23,7 @@ import { type RowPredicate, type RunSchema, } from "@agenta/entities/evaluationRun/etl" -import {Button, Divider, Input, InputNumber, Popover, Segmented, Select, Tooltip} from "antd" +import {Button, Divider, Input, InputNumber, Popover, Select, Tooltip} from "antd" import {useAtom} from "jotai" import {Filter as FilterIcon, Loader2, Plus, X} from "lucide-react" @@ -160,21 +160,6 @@ const ScenarioFilterBar = ({
Filter scenarios
- {conditions.length >= 2 && ( -
- Match - - size="small" - value={draft.op} - options={[ - {label: "All (AND)", value: "and"}, - {label: "Any (OR)", value: "or"}, - ]} - onChange={(op) => setDraft((d) => ({...d, op}))} - /> -
- )} -
{conditions.map((condition, index) => { const fieldKey = condition.columnName ? encodeField(condition) : undefined @@ -186,9 +171,27 @@ const ScenarioFilterBar = ({ return (
- - {index === 0 ? "Where" : draft.op === "and" ? "And" : "Or"} - + {/* + * Row-level AND/OR connector. The group has a + * single op (flat group — D8), so every + * connector shows and toggles the same value. + */} + {index === 0 ? ( + Where + ) : ( + + size="small" + variant="borderless" + className="w-16 shrink-0" + value={draft.op} + options={[ + {label: "And", value: "and"}, + {label: "Or", value: "or"}, + ]} + getPopupContainer={getWithinPopover} + onChange={(op) => setDraft((d) => ({...d, op}))} + /> + )} size="small" placeholder="Column" From e80759e4e9b90577187474e21cc78a429ad9c39a Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 14:03:27 +0200 Subject: [PATCH 18/29] fix(oss): align first scenario filter row with the rest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "Where" label and the And/Or connector select sat directly in the flex row with their own width classes, which resolved to different widths — so the first row's Column select didn't line up with the rows below. Both connectors now sit inside one shared fixed-width slot, so every row's Column select starts at the same x. --- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 44 +++++++++++-------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index 8aa8dfffb5..f40a9624a3 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -172,26 +172,32 @@ const ScenarioFilterBar = ({ return (
{/* - * Row-level AND/OR connector. The group has a - * single op (flat group — D8), so every - * connector shows and toggles the same value. + * Row-level AND/OR connector in a fixed-width + * slot — so the Column select after it lines up + * across every row regardless of whether the + * connector is the "Where" label or the select. + * The group has a single op (flat group — D8), + * so every connector shows and toggles the same + * value. */} - {index === 0 ? ( - Where - ) : ( - - size="small" - variant="borderless" - className="w-16 shrink-0" - value={draft.op} - options={[ - {label: "And", value: "and"}, - {label: "Or", value: "or"}, - ]} - getPopupContainer={getWithinPopover} - onChange={(op) => setDraft((d) => ({...d, op}))} - /> - )} +
+ {index === 0 ? ( + Where + ) : ( + + size="small" + variant="borderless" + className="w-full" + value={draft.op} + options={[ + {label: "And", value: "and"}, + {label: "Or", value: "or"}, + ]} + getPopupContainer={getWithinPopover} + onChange={(op) => setDraft((d) => ({...d, op}))} + /> + )} +
size="small" placeholder="Column" From 20a5274aa40d108980689e855b8746dc183f9cf3 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 14:09:40 +0200 Subject: [PATCH 19/29] fix(oss): widen scenario filter connector slot to fit And/Or MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 64px connector slot truncated the default-size And/Or select to "A…". Widened to 80px so the connector shows in full. --- .../components/EvalRunDetails/etl/ScenarioFilterBar.tsx | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index f40a9624a3..d434b63e02 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -180,12 +180,11 @@ const ScenarioFilterBar = ({ * so every connector shows and toggles the same * value. */} -
+
{index === 0 ? ( Where ) : ( - size="small" variant="borderless" className="w-full" value={draft.op} @@ -199,7 +198,6 @@ const ScenarioFilterBar = ({ )}
- size="small" placeholder="Column" className="w-[200px] shrink-0" showSearch @@ -223,7 +221,6 @@ const ScenarioFilterBar = ({ }} /> - size="small" className="w-[110px] shrink-0" value={condition.op} disabled={!field} @@ -327,7 +324,6 @@ const ConditionValueInput = ({ // boolean as a string and decode on change. return ( - size="small" className="w-full" placeholder="Value" disabled={disabled} @@ -344,7 +340,6 @@ const ConditionValueInput = ({ if (valueType === "number") { return ( Date: Fri, 22 May 2026 14:14:04 +0200 Subject: [PATCH 20/29] fix(oss): show filter "scanning" only while actually working MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The scanning indicator gated on `hasMore`, so it stayed on forever once the viewport-fill loop had stopped (it stops at the match target) — the dataset still "has more" pages, but the loop is idle. useScenarioFilter now exposes `isFilling` (the loop still intends to load pages). scanInProgress = isFilling OR a page fetch / hydrate batch in flight — so "scanning" turns off once enough matches are found and nothing is in flight, and reappears only while a (scroll-triggered) load is actually running. --- .../src/components/EvalRunDetails/Table.tsx | 28 +++++++++++-------- .../EvalRunDetails/etl/useScenarioFilter.ts | 13 +++++++++ 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx index 7a4923ac99..5cdd48a02e 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -125,15 +125,16 @@ const EvalRunDetailsTable = ({ // Phase 2 — multi-predicate filtering (D8). Filters the base run's // rows; each comparison group follows its base row. 
- const {filteredBaseRows, effectiveFilter, active, confirmedMatchCount} = useScenarioFilter({ - projectId, - runId, - schema: runSchema, - baseRows: basePagination.rows, - loadNextPage: basePagination.loadNextPage, - hasMore: basePagination.paginationInfo.hasMore, - isFetching: basePagination.paginationInfo.isFetching, - }) + const {filteredBaseRows, effectiveFilter, active, confirmedMatchCount, isFilling} = + useScenarioFilter({ + projectId, + runId, + schema: runSchema, + baseRows: basePagination.rows, + loadNextPage: basePagination.loadNextPage, + hasMore: basePagination.paginationInfo.hasMore, + isFetching: basePagination.paginationInfo.isFetching, + }) const etlColumns = useEtlColumns({projectId, runId, schema: runSchema}) @@ -149,10 +150,13 @@ const EvalRunDetailsTable = ({ sliceMode: "auto", }) - // The filter scan is still running — more pages to load OR a hydrate - // batch in flight. + // The filter scan is actively working — the viewport-fill loop still + // wants more pages (`isFilling`), a page is being fetched, or a + // hydrate batch is in flight. Once enough matches are found the loop + // stops, so this goes false even though the dataset has more pages + // (`hasMore`) — it only shows "scanning" while real work is happening. const scanInProgress = - active && (basePagination.paginationInfo.hasMore || hydration.isHydrating) + isFilling || (active && (basePagination.paginationInfo.isFetching || hydration.isHydrating)) // Nothing has confirmed-matched yet — show the full empty + loading // overlay. Once the first match lands, rows show and grow (no overlay, // no flicker — `filteredBaseRows` only ever grows during a scan). 
diff --git a/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts b/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts index a84afc12e9..7458d9fc2a 100644 --- a/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts +++ b/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts @@ -118,6 +118,14 @@ export interface UseScenarioFilterResult { filteredBaseRows: TRow[] /** Rows confirmed (hydrated AND matching) to satisfy the filter. */ confirmedMatchCount: number + /** + * True while the viewport-fill loop still intends to load more pages + * (a filter is active, the target match count is not yet reached, and + * the dataset has more pages). Goes false once enough matches are + * found — even if the dataset has more pages — so the UI can stop + * showing "scanning" when the loop is actually idle. + */ + isFilling: boolean } export function useScenarioFilter({ @@ -163,6 +171,10 @@ export function useScenarioFilter({ // match — so the count is just its length. const confirmedMatchCount = active ? filteredBaseRows.length : 0 + // The viewport-fill loop still wants more pages — i.e. the autonomous + // scan is genuinely in progress. + const isFilling = active && hasMore && confirmedMatchCount < VIEWPORT_FILL_TARGET + // Viewport-fill loop — a strict filter may keep the visible row count // below the viewport, so IVT's scroll-triggered loadMore never fires. // Drive it ourselves until enough confirmed matches accumulate or the @@ -180,5 +192,6 @@ export function useScenarioFilter({ active, filteredBaseRows, confirmedMatchCount, + isFilling, } } From 2a2cc9785ff31631b0ec6ddf74ceba9f5108f788 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 15:09:40 +0200 Subject: [PATCH 21/29] fix(oss): move the scenario filter into the run header row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The filter sat on its own strip below the run header, taking a second line. 
It now renders inline in the "Evaluations:" header row (Scenarios tab only) — one line for everything. The filter bar is now self-contained: given a runId it derives the run schema, column value types, and live scan status from atoms. The scenarios table publishes its scan status (match count + scanning) via scenarioFilterStatusAtomFamily; the header's filter bar reads it. --- .../src/components/EvalRunDetails/Table.tsx | 31 ++++---- .../EvalRunDetails/components/Page.tsx | 2 +- .../components/PreviewEvalRunHeader.tsx | 72 +++++++++++-------- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 63 ++++++++-------- .../EvalRunDetails/etl/scenarioFilterState.ts | 19 +++++ 5 files changed, 108 insertions(+), 79 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx index 5cdd48a02e..35da206fa8 100644 --- a/web/oss/src/components/EvalRunDetails/Table.tsx +++ b/web/oss/src/components/EvalRunDetails/Table.tsx @@ -1,9 +1,9 @@ -import {useCallback, useMemo, useRef} from "react" +import {useCallback, useEffect, useMemo, useRef} from "react" import type {RunSchema} from "@agenta/entities/evaluationRun/etl" import {message} from "@agenta/ui/app-message" import clsx from "clsx" -import {useAtomValue, useStore} from "jotai" +import {useAtomValue, useSetAtom, useStore} from "jotai" import VirtualizedScenarioTableAnnotateDrawer from "@/oss/components/EvalRunDetails/components/AnnotateDrawer/VirtualizedScenarioTableAnnotateDrawer" import { @@ -27,8 +27,7 @@ import {DEFAULT_SCENARIO_PAGE_SIZE, evaluationRunQueryAtomFamily} from "./atoms/ import type {PreviewTableRow} from "./atoms/tableRows" import ScenarioColumnVisibilityPopoverContent from "./components/columnVisibility/ColumnVisibilityPopoverContent" import {CellMaterializerContext} from "./etl/cellMaterializerContext" -import {buildColumnValueTypeResolver} from "./etl/columnValueTypes" -import ScenarioFilterBar from "./etl/ScenarioFilterBar" +import 
{scenarioFilterStatusAtomFamily} from "./etl/scenarioFilterState" import {useCellMaterialization} from "./etl/useCellMaterialization" import {useEtlColumns} from "./etl/useEtlColumns" import {useHydrateScenarios} from "./etl/useHydrateScenarios" @@ -97,14 +96,6 @@ const EvalRunDetailsTable = ({ const previewColumns = usePreviewColumns({columnResult, evaluationType}) - // Filter column value types — sourced from the evaluator output - // schemas (column `metricType`), so the filter bar offers the right - // operators + value input per column. - const filterValueTypeResolver = useMemo( - () => buildColumnValueTypeResolver(columnResult), - [columnResult], - ) - // ── ETL schema columns + self-hydrating cells (Phase 1 — T2 + T3) ── // The schema columns (testset / application / evaluator / metrics / // other) are derived from the run graph and rendered by cells that @@ -162,6 +153,15 @@ const EvalRunDetailsTable = ({ // no flicker — `filteredBaseRows` only ever grows during a scan). const isScanning = scanInProgress && confirmedMatchCount === 0 + // Publish the scan status so the filter bar — which lives in the run + // header, a separate part of the tree — can show the match count. + const setFilterStatus = useSetAtom( + useMemo(() => scenarioFilterStatusAtomFamily(runId), [runId]), + ) + useEffect(() => { + setFilterStatus({matchCount: confirmedMatchCount, scanning: scanInProgress}) + }, [setFilterStatus, confirmedMatchCount, scanInProgress]) + // Cell-side lazy materializer — coalesces visible cells' slice // requests into one bulk fetch per (slice, run). const cellMaterializer = useCellMaterialization({projectId, runId}) @@ -965,13 +965,6 @@ const EvalRunDetailsTable = ({ return (
-
/* diff --git a/web/oss/src/components/EvalRunDetails/components/Page.tsx b/web/oss/src/components/EvalRunDetails/components/Page.tsx index ebad5df214..7369ee14cc 100644 --- a/web/oss/src/components/EvalRunDetails/components/Page.tsx +++ b/web/oss/src/components/EvalRunDetails/components/Page.tsx @@ -140,7 +140,7 @@ const EvalRunPreviewPage = ({runId, evaluationType, projectId = null}: EvalRunPr headerClassName="px-4 pt-2" >
- + { const _invocationRefs = useAtomValue(useMemo(() => runInvocationRefsAtomFamily(runId), [runId])) const _testsetIds = useAtomValue(useMemo(() => runTestsetIdsAtomFamily(runId), [runId])) @@ -173,38 +176,45 @@ const PreviewEvalRunMeta = ({ return (
-
- Evaluations: -
- {runDescriptors.map((run, index) => { - const isBaseRun = index === 0 - const tagColor = getComparisonSolidColor(index) - return ( - - ) : undefined - } - onClose={ - !isBaseRun - ? (event) => { - event.preventDefault() - setCompareRunIds((prev) => - prev.filter((id) => id !== run.id), - ) - } - : undefined - } - /> - ) - })} +
+
+ Evaluations: +
+ {runDescriptors.map((run, index) => { + const isBaseRun = index === 0 + const tagColor = getComparisonSolidColor(index) + return ( + + ) : undefined + } + onClose={ + !isBaseRun + ? (event) => { + event.preventDefault() + setCompareRunIds((prev) => + prev.filter((id) => id !== run.id), + ) + } + : undefined + } + /> + ) + })} +
+ {activeView === "scenarios" ? ( +
+ +
+ ) : null}
diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index d434b63e02..973742d188 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -2,14 +2,13 @@ * ScenarioFilterBar — multi-condition AND/OR filter for the evaluation * run scenarios table (decision D8). * - * Follows the observability `Filters` pattern: a compact "Filters" button - * opens a popover holding the condition rows, so the conditions never - * take over the page layout. Edits are staged in a draft and committed on - * "Apply" (so the table is not re-scanned on every keystroke). + * Self-contained: given only a `runId` it derives the run schema, the + * column value types, and the live scan status from atoms — so it can be + * dropped into the run header rather than sitting above the table. * - * Columns come from `buildFilterSchema` (run graph); column value types - * come from the evaluator output schema via `resolveValueType`, which - * drives the operator set and the value input. + * Follows the observability `Filters` pattern: a compact "Filters" button + * opens a popover holding the condition rows. Edits are staged in a draft + * and committed on "Apply". 
*/ import {useMemo, useState} from "react" @@ -24,10 +23,17 @@ import { type RunSchema, } from "@agenta/entities/evaluationRun/etl" import {Button, Divider, Input, InputNumber, Popover, Select, Tooltip} from "antd" -import {useAtom} from "jotai" +import {useAtom, useAtomValue} from "jotai" import {Filter as FilterIcon, Loader2, Plus, X} from "lucide-react" -import {scenarioFilterAtomFamily, isConditionComplete} from "./scenarioFilterState" +import {evaluationRunQueryAtomFamily, tableColumnsAtomFamily} from "../atoms/table" + +import {buildColumnValueTypeResolver} from "./columnValueTypes" +import { + scenarioFilterAtomFamily, + isConditionComplete, + scenarioFilterStatusAtomFamily, +} from "./scenarioFilterState" const OP_LABELS: Record = { eq: "equals", @@ -79,31 +85,32 @@ const getWithinPopover = (trigger: HTMLElement) => export interface ScenarioFilterBarProps { runId: string - schema: RunSchema | null - /** Column value-type resolver, sourced from the evaluator output schema. */ - resolveValueType: (field: { - groupKind: string - groupSlug: string | null - columnName: string - }) => FilterValueType | undefined - /** True while the filter scan is still running. */ - scanning?: boolean - /** Confirmed matches found so far. */ - matchCount?: number } -const ScenarioFilterBar = ({ - runId, - schema, - resolveValueType, - scanning = false, - matchCount = 0, -}: ScenarioFilterBarProps) => { +const ScenarioFilterBar = ({runId}: ScenarioFilterBarProps) => { const [applied, setApplied] = useAtom(scenarioFilterAtomFamily(runId)) + const {matchCount, scanning} = useAtomValue(scenarioFilterStatusAtomFamily(runId)) const [open, setOpen] = useState(false) // Draft conditions edited inside the popover; committed on Apply. const [draft, setDraft] = useState(applied) + // Run schema (steps + mappings) — drives the filterable columns. 
+ const runQuery = useAtomValue(useMemo(() => evaluationRunQueryAtomFamily(runId), [runId])) + const schema = useMemo(() => { + const data = runQuery.data?.rawRun?.data + const steps = data?.steps + const mappings = data?.mappings + if (!Array.isArray(steps) || !Array.isArray(mappings)) return null + return {steps, mappings} + }, [runQuery.data]) + + // Column value types — sourced from the evaluator output schemas. + const columnResult = useAtomValue(useMemo(() => tableColumnsAtomFamily(runId), [runId])) + const resolveValueType = useMemo( + () => buildColumnValueTypeResolver(columnResult), + [columnResult], + ) + const fields = useMemo( () => buildFilterSchema(schema, {resolveValueType}).fields.filter( @@ -275,7 +282,7 @@ const ScenarioFilterBar = ({ ) return ( -
+
({ /** True when at least one complete condition is set. */ export const isScenarioFilterActive = (group: PredicateGroup): boolean => group.conditions.some(isConditionComplete) + +/** Live scan status — written by the scenarios table, read by the filter bar. */ +export interface ScenarioFilterStatus { + /** Confirmed matches found so far. */ + matchCount: number + /** True while the filter scan is actively working. */ + scanning: boolean +} + +const EMPTY_STATUS: ScenarioFilterStatus = {matchCount: 0, scanning: false} + +/** + * Per-run filter scan status. The scenarios table runs the scan and + * writes this; the filter bar — which lives in the run header, a separate + * part of the component tree — reads it for its match-count indicator. + */ +export const scenarioFilterStatusAtomFamily = atomFamily((_runId: string) => + atom(EMPTY_STATUS), +) From cb69779a96dda2d593f09eee9fb95bacd917e97c Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 15:17:05 +0200 Subject: [PATCH 22/29] fix(oss): reposition the scenario filter per the updated mockup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The filter is now a compact icon-only funnel button with a condition- count badge, placed in the header's right group just before "Compare" (was a text "Filters" button + inline match-count on the left). The match-count / scanning indicator moves into the popover header ("N matches · scanning…" next to the title), so the closed affordance stays minimal. 
--- .../components/PreviewEvalRunHeader.tsx | 70 +++++++++---------- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 60 ++++++++-------- 2 files changed, 64 insertions(+), 66 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx b/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx index 6ab3163146..965029bb36 100644 --- a/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx +++ b/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx @@ -176,45 +176,38 @@ const PreviewEvalRunMeta = ({ return (
-
-
- Evaluations: -
- {runDescriptors.map((run, index) => { - const isBaseRun = index === 0 - const tagColor = getComparisonSolidColor(index) - return ( - - ) : undefined - } - onClose={ - !isBaseRun - ? (event) => { - event.preventDefault() - setCompareRunIds((prev) => - prev.filter((id) => id !== run.id), - ) - } - : undefined - } - /> - ) - })} -
+
+ Evaluations: +
+ {runDescriptors.map((run, index) => { + const isBaseRun = index === 0 + const tagColor = getComparisonSolidColor(index) + return ( + + ) : undefined + } + onClose={ + !isBaseRun + ? (event) => { + event.preventDefault() + setCompareRunIds((prev) => + prev.filter((id) => id !== run.id), + ) + } + : undefined + } + /> + ) + })}
- {activeView === "scenarios" ? ( -
- -
- ) : null}
@@ -230,6 +223,7 @@ const PreviewEvalRunMeta = ({ ) : null} + {activeView === "scenarios" ? : null}
diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index 973742d188..de5ef6c8dd 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -164,7 +164,18 @@ const ScenarioFilterBar = ({runId}: ScenarioFilterBarProps) => { const popoverContent = (
-
Filter scenarios
+
+ Filter scenarios + {appliedCount > 0 ? ( + + {scanning ? : null} + + {matchCount} {matchCount === 1 ? "match" : "matches"} + {scanning ? " · scanning…" : ""} + + + ) : null} +
@@ -282,35 +293,28 @@ const ScenarioFilterBar = ({runId}: ScenarioFilterBarProps) => { ) return ( -
- + - - - {appliedCount > 0 && ( - - {scanning && } - - {matchCount} {matchCount === 1 ? "match" : "matches"} - {scanning ? " · scanning…" : ""} - + 0 ? "bg-zinc-700 text-white" : "bg-zinc-100 text-zinc-500" + }`} + > + {appliedCount} - )} -
+ + ) } From 842bc519ab30cabbaa66be52db36dae629bc9810 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 15:48:11 +0200 Subject: [PATCH 23/29] chore: retire the ETL scenarios PoC (T7) The /etl-poc test page proved the run-graph + molecule-cache strategy that Phase 1 + 2 shipped into the real scenarios table. Production has its own copies of the ported hooks, so the PoC is now dead test-page code. - delete components/EtlPocScenarios/ and the /etl-poc routes (oss + ee) - drop the /etl-poc branch from the Layout human-eval route check - mark T4 + T7 done in the design doc; Phase 3 (T5/T6/T8) remains --- docs/designs/eval-scenarios-table-integration.md | 13 ++++++++----- .../EvalRunDetails/etl/useHydrateScenarios.ts | 5 ++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/docs/designs/eval-scenarios-table-integration.md b/docs/designs/eval-scenarios-table-integration.md index e914d84424..355436dcbb 100644 --- a/docs/designs/eval-scenarios-table-integration.md +++ b/docs/designs/eval-scenarios-table-integration.md @@ -279,15 +279,18 @@ co-consumers. - [ ] **Perf gate (P1)** — benchmark vs the old table, 1000+ scenarios, comparison on. **Gates T4.** **Phase 2 — filtering** -- [ ] **T4 (P1, human: ~3d / CC: ~half-day)** — `filterSchema` + **multi-predicate AND/OR** `filterTransform` + multi-condition predicate UI + viewport-fill loop; reuse `withRateLimitRetry`. Composition decided (D8). Gated on the Phase 1 perf gate. +- [x] **T4 (P1)** — multi-predicate AND/OR filtering (D8): `filterSchema` + `evaluateRowFilter` / `PredicateGroup` core (entities, unit-tested) + a popover filter bar in the run header + confirmed-match incremental rendering + viewport-fill loop. Column value types come from the evaluator output schema. v1 withholds testset/application columns behind a UI allowlist and `in`/`nin` operators from the UI. 
**Phase 3 — comparison, live, co-consumers** -- [ ] **T5 (P1, human: ~3d / CC: ~half-day)** — comparison build: compare-run schema fetch + per-run hydration + testcase_id join + export-path migration. +- [ ] **T5 (P1, human: ~3d / CC: ~half-day)** — comparison build: compare-run schema fetch + per-run hydration + testcase_id join + export-path migration. (Phase 1 ships best-effort: compare rows resolve against the base run's schema.) - [ ] **T6 (P2, human: ~1d / CC: ~2h)** — live updates: poll + page invalidation + human gap-fill. - [ ] **T8 (P1, human: ~1d / CC: ~2h)** — migrate focus drawer + `SingleScenarioViewer` off `useScenarioCellValue`; delete it. **Cleanup** -- [ ] **T7 (P2, human: ~1h / CC: ~10min)** — delete `EtlPocScenarios/` + `/etl-poc` routes once Phase 3 parity is verified. +- [x] **T7** — `EtlPocScenarios/` + `/etl-poc` routes (oss + ee) deleted. Done ahead of the Phase-3 gate at the maintainer's direction: production has its own copies of the ported hooks, so the PoC was dead test-page code. + +**Open / advisory** +- The **D5 perf gate** was not formally benchmarked — the table was QA'd functionally throughout Phase 1 + 2 instead. ## GSTACK REVIEW REPORT @@ -306,5 +309,5 @@ co-consumers. - **D7 (implementation-time finding):** reading `Table.tsx` showed the CSV export path (`exportResolveValue`, `columnLookupMap`, `loadAllPagesBeforeExport`) is keyed off `columnResult` column ids, which differ from `useEtlColumns` keys. **Phase 1 swaps display columns only** and keeps `usePreviewColumns`/`columnResult` alive for export; the old column path fully retires in Phase 3 with the export migration (T5). The "other"-column un-drop ripples into `ColumnLeaf`, `EtlResolvedCell`, and `useCellMaterialization`. - **D8 (Phase 2 decision):** filter composition resolved — **multi-predicate AND/OR from day 1**, not the PoC's single predicate. The predicate type generalises to a flat condition group; the filter bar reuses the observability multi-condition UI. 
- **UNRESOLVED:** 0 — filter composition closed (D8). No open decisions. -- **STATUS:** Phase 1 (T2+T3) implemented and committed. Next: the D5 perf gate, which gates T4. -- **VERDICT:** ENG + DESIGN REVIEW CLEARED — Phase 1 (T2+T3) shipped; T4 is multi-predicate, gated on the perf gate. +- **STATUS:** Phase 1 (T2+T3) and Phase 2 (T4, multi-predicate filtering) shipped. The PoC is retired (T7). Remaining: Phase 3 — T5 comparison, T6 live updates, T8 co-consumer migration (focus drawer + `SingleScenarioViewer` still on `useScenarioCellValue`). +- **VERDICT:** ENG + DESIGN REVIEW CLEARED — Phase 1 + Phase 2 shipped + PoC retired; Phase 3 (T5/T6/T8) remains. diff --git a/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts b/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts index ac8f1dbd02..560ea14eec 100644 --- a/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts +++ b/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts @@ -2,9 +2,8 @@ * useHydrateScenarios * * Watches the scenario rows the table has loaded and triggers a bulk - * hydrate pass per *new* page. Mirrors the ETL strategy proved by the - * `EtlPocScenarios` PoC: bulk requests per page, all entities populated - * together. + * hydrate pass per *new* page — bulk requests per page, all entities + * populated together. * * Flow per newly-seen scenario set: * 1. evaluationResultMolecule.actions.prefetchByScenarioIds → results From 39cf81b876067c2d4c96acc2b56577f57ee93d25 Mon Sep 17 00:00:00 2001 From: Arda Erzin Date: Fri, 22 May 2026 15:56:57 +0200 Subject: [PATCH 24/29] feat(oss): add in / not-in operators to the scenario filter The filter engine already supported in/nin; the popover now exposes them. A list operator shows a tag-style multi-value input (comma / Enter to add entries), numeric columns coerce entries to numbers, and a condition with an empty list counts as incomplete. 
Switching a condition between a scalar and a list operator resets its value to the right shape. --- .../EvalRunDetails/etl/ScenarioFilterBar.tsx | 49 +++++++++++++++++-- .../EvalRunDetails/etl/scenarioFilterState.ts | 8 ++- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx index de5ef6c8dd..bed7a7d6a7 100644 --- a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx +++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx @@ -46,9 +46,12 @@ const OP_LABELS: Record = { nin: "not in", } -// Operators offered in the UI. `in` / `nin` are supported by the filter -// engine but need an array-value input — deferred from this v1 bar. -const UI_OPERATORS: FilterOperator[] = ["eq", "ne", "lt", "lte", "gt", "gte"] +// Operators offered in the UI. `in` / `nin` take a list of values (a tag +// input); the rest take a single value. +const UI_OPERATORS: FilterOperator[] = ["eq", "ne", "lt", "lte", "gt", "gte", "in", "nin"] + +/** True for operators whose value is a list rather than a scalar. */ +const isListOperator = (op: FilterOperator) => op === "in" || op === "nin" /** * v1 column-kind allowlist for filtering. Only metric-related columns @@ -244,9 +247,19 @@ const ScenarioFilterBar = ({runId}: ScenarioFilterBarProps) => { disabled={!field} options={ops.map((o) => ({value: o, label: OP_LABELS[o]}))} getPopupContainer={getWithinPopover} - onChange={(op) => updateCondition(index, {op})} + onChange={(op) => { + // Switching between scalar and list + // operators changes the value shape — + // reset it so it stays valid. + const isList = isListOperator(op) + const wasList = Array.isArray(condition.value) + const value = + isList === wasList ? condition.value : isList ? [] : "" + updateCondition(index, {op, value}) + }} /> { ) } -/** Value input — shape depends on the field's (best-effort) value type. 
*/ +/** Value input — shape depends on the operator and the field value type. */ const ConditionValueInput = ({ + op, valueType, value, disabled, onChange, }: { + op: FilterOperator valueType: FilterValueType value: unknown disabled: boolean onChange: (value: unknown) => void }) => { + // `in` / `nin` — a list of values entered as tags. + if (isListOperator(op)) { + const tags = Array.isArray(value) ? value.map((v) => String(v)) : [] + return ( +