diff --git a/docs/designs/eval-scenarios-table-integration.md b/docs/designs/eval-scenarios-table-integration.md
new file mode 100644
index 0000000000..d39ed7265e
--- /dev/null
+++ b/docs/designs/eval-scenarios-table-integration.md
@@ -0,0 +1,314 @@
+# Eval Scenarios Table — ETL Integration
+
+**Created:** 2026-05-22
+**Status:** Implemented — Phases 1–3 shipped (Eng + Design reviewed); D5 perf gate not formally benchmarked
+**Related:** [eval-etl-engine](./eval-etl-engine.md), [etl-engine](./etl-engine.md), [eval-filtering](./eval-filtering.md)
+**Authors:** Arda
+
+---
+
+## Summary
+
+The `EtlPocScenarios` PoC (the `/etl-poc` page) showed that an evaluation-run
+scenarios table can be fast given a specific strategy: thin rows (identity
+only), page-level bulk hydration into molecule caches, self-resolving cells,
+and ETL-engine-backed predicate filtering.
+
+This doc covers folding that strategy into the **real** eval run scenarios
+table (`EvalRunDetails/Table.tsx`) and retiring the PoC. It is **phased** — a
+core production table never big-bangs.
+
+> **Eng review reframe.** The first draft called the data-layer swap a
+> "low-risk mechanical port." That was wrong. The PoC only ever ran against
+> *finished* runs — it fabricates `scenario: {status: "success"}`
+> (`EtlResolvedCell.tsx:135`, `index.tsx:692`). Rendering pending / running /
+> failed scenarios and a real "skeleton while pending" policy is **unbuilt
+> design work**, now scoped into Phase 1. See [Resolved decisions](#resolved-decisions).
+
+---
+
+## Current state — two implementations of one table
+
+| | Production (`EvalRunDetails/Table.tsx`) | PoC (`EtlPocScenarios`) |
+|---|---|---|
+| Store | `evaluationPreviewTableStore` — semi-full rows | `scenarioThinPaginatedStore` — `{key, id, scenarioId}` |
+| Columns | backend metadata (`usePreviewColumns`) | run graph (`useEtlColumns` → `resolveMappings`) |
+| Cell data | per-visible-cell fetch (`useScenarioCellValue`) | `EtlResolvedCell` from molecule caches; `useHydrateScenarios` bulk-fills per page |
+| Filtering | none | predicate bar + ETL viewport-fill loop |
+| Comparison | interleaved rows, 2-4 runs | single run |
+| Live runs | 5s run-status poll; 15-30s `staleTime`; human-only metrics gap-fill | none — assumes terminal data |
+| Fetch path | `fetchEvaluationScenarioWindow` | **same** `fetchEvaluationScenarioWindow` |
+
+`scenarioPaginatedStore.ts`'s own header states the intent: *"replace
+`evaluationPreviewTableStore` with this once the scenarios view is on the
+molecule-cache pattern."* This is that project.
+
+---
+
+## Resolved decisions
+
+| # | Decision |
+|---|----------|
+| **Hydration shape** | Thin rows + self-hydrating cells. The thin row carries identity + `testcaseId` (comparison join key) + `status` (live-update + skeleton) — never column data. |
+| **Column source** | Run graph (`data.steps` + `data.mappings`) via `resolveMappings`. **Correction:** `useEtlColumns` currently drops `group.kind === "other"` columns (`useEtlColumns.tsx:56`, "skip in the test page"). Production must keep them — that shortcut is removed in T2. |
+| **Cell caching** | Same molecules over the same TanStack layer; no net regression *expected* — validated, not asserted, by the Phase 1 perf gate. |
+| **D1 — phasing** | Phase the migration. Phase 1: data-layer swap. Phase 2: filtering. Phase 3: comparison + live + co-consumers. Then retire the PoC. Each phase reviewable and revertable; the table works between phases. |
+| **D2 — comparison display** | Interleaved rows (today's model), not testcase-aligned columns. Compare-mode column set = shared testcase inputs + the **common-evaluator intersection** across compared runs + the standard invocation output. Reuses single-run column derivation. |
+| **D3 — live updates** | Match production's modest bar: run-status poll + page invalidation while non-terminal + human-eval metrics gap-fill. No real scenario streaming. |
+| **D5 — perf gate** | After Phase 1, benchmark the new table vs the current `useScenarioCellValue` table on a 1000+ scenario run with comparison on. A regression stops Phase 2. |
+| **D8 — filter composition** | **Multi-predicate from day 1.** Phase 2 ships multi-condition AND/OR filtering, not the PoC's single predicate. The predicate type generalises to a condition *group* (`{op: "and" \| "or", conditions: RowPredicate[]}`); the filter bar reuses the observability multi-condition filter UI. Closed at Phase 2 start (was the one open decision). |
+
+**No open decisions.**
+
+---
+
+## Architecture — target
+
+The production table adopts the PoC strategy in place. Four layers (per
+`eval-etl-engine.md`):
+
+```
+EvalRunDetails table (OSS UI)
+  └── thin scenarios store ──> ETL filter pipeline ──> rendered viewport
+        (identity + join keys)   (runLoop + filterTransform)   (InfiniteVirtualTable)
+                                        │
+        cells self-resolve from ────────┘
+        molecule caches (results / metrics / testcases / traces)
+        bulk-hydrated per page; cell-materialized on demand
+```
+
+- **Source** — the thin scenarios store; reuses `fetchEvaluationScenarioWindow`.
+- **Transform** — the eval-specific `filterTransform`. "Skeleton while
+  pending" for rows whose slices are not yet hydrated **or whose scenario has
+  not yet run** — these two cases are distinct and both must be handled.
+- **Sink** — the rendered viewport; the loop runs until the viewport fills.
+- **Cells** — `EtlResolvedCell`, resolving from molecule caches.
+
+---
+
+## Phase 1 — column + cell swap
+
+The table's internals, table-only. The focus drawer and `SingleScenarioViewer`
+stay on `useScenarioCellValue` (kept alive) until Phase 3.
+
+> **Implementation-time finding — T1 dropped.** Reading
+> `evaluationPreviewTableStore.ts` confirmed it is *already* a thin store:
+> `PreviewTableRow` carries only identity + `testcaseId` + `status` +
+> `scenarioIndex` + comparison fields — zero column data — and it already does
+> per-eval-type window order (line 114). The PoC's separate
+> `scenarioThinPaginatedStore` exists only to drop a couple of cheap unused
+> fields. **The store stays as-is — there is no T1.** The eng-review outside
+> voice flagged this; a direct read confirmed it.
+
+Phase 1 is the **column + cell swap** in `Table.tsx`. T2 and T3 are **coupled**
+— a column definition carries its own cell `render` function, so the column
+source and the cell renderer swap together in one change.
+
+- **T2 — schema columns (display only).** Wire `useEtlColumns` /
+  `resolveMappings` into `Table.tsx` for the **rendered** columns. **Remove the
+  "other"-column drop** (`useEtlColumns.tsx:56`) so the visible set matches
+  today — note this ripples: `ColumnLeaf["kind"]`, `EtlResolvedCell`'s
+  `columnKind`, and `useCellMaterialization`'s slice map all need an "other"
+  case. `usePreviewColumns` / `usePreviewTableData` / `columnResult` **stay
+  alive** — the CSV export (`exportResolveValue`, `columnLookupMap`) is keyed
+  off `columnResult` ids, which differ from `useEtlColumns` keys. Full
+  retirement of the old column path moves to Phase 3 with the export
+  migration (T5). Two column systems coexist transitionally (accepted under D1).
+- **T3 — self-hydrating cells + non-terminal rendering.** `EtlResolvedCell` +
+  `useHydrateScenarios` + `useCellMaterialization`, against the existing
+  `evaluationPreviewTableStore` rows (keyed by `scenarioId`). **Not** purely
+  mechanical: add real rendering for pending / running / failed / partial
+  scenarios, and a "skeleton while pending" policy that distinguishes
+  *slice-not-hydrated* from *scenario-not-run*. The PoC's `status: "success"`
+  fabrication is removed.
+
+**Perf gate (D5)** — after T2+T3 land: benchmark the new table against the
+current one on a 1000+ scenario run, comparison on. Regression → stop, rethink.
+
+---
+
+## Phase 2 — filtering
+
+- **T4 — multi-predicate filtering (D8).** Ships multi-condition AND/OR
+  from day 1 — not the PoC's single predicate. `filterSchema` derives
+  filterable fields: columns → evaluator steps → evaluator output schemas
+  → typed fields + type-matched operators (`eval-filtering.md` D4). The
+  predicate generalises from `RowPredicate` to a condition *group*
+  (`{op: "and" | "or", conditions: RowPredicate[]}`, one nesting level for
+  v1 — flat AND/OR, no arbitrary trees); `predicateToEntitySlices` takes
+  the union of every condition's slices. The `filterTransform` evaluates
+  the group per row against hydrated metrics; the loop runs until the
+  viewport fills. The filter bar reuses the observability multi-condition
+  filter UI. **Reuse `withRateLimitRetry`** for the scan — a low-hit-ratio
+  filter scans many scenario + metric pages and EE throttling will 429 it
+  (the batch-add lesson).
+
+---
+
+## Phase 3 — comparison, live updates, co-consumers
+
+- **T5 — comparison.** A **build, not a port** — the PoC has zero multi-run
+  code. Per compared run: a second store scope, a **schema fetch** (needed for
+  the common-evaluator intersection), and per-run hydration of result slices
+  (`testcase_id` lives on results). Then align compared runs to the *filtered*
+  main rows by `testcase_id` (the `mergedRows` logic exists in production —
+  port it over the thin/cache model). The **CSV export path**
+  (`Table.tsx:542`) rebuilds the merge logic and migrates here too.
+- **T6 — live updates.** Run-status poll + non-terminal page invalidation +
+  human-eval metrics gap-fill.
+- **T8 — co-consumer migration.** Migrate the focus drawer (`focusDrawerAtom`,
+  `FocusDrawerHeader`, `FocusDrawerSidePanel`, `FocusDrawer`) and
+  `SingleScenarioViewerPOC` off `useScenarioCellValue` + `evaluationPreviewTableStore`,
+  then delete `useScenarioCellValue`.
+
+---
+
+## Retire the PoC
+
+- **T7** — delete `EtlPocScenarios/` and the `/etl-poc` routes (oss + ee).
+  Gated on Phase 3 parity verified.
+
+---
+
+## Design — interaction states & filter UX
+
+From the design review (focused — the migration preserves the table's visual
+design; these are the genuinely new design surfaces).
+
+**Cell states:**
+- *Skeleton (not hydrated)* — reuse the PoC's `EtlSkeletonCell`: a fixed-height
+  placeholder bar, identical row height to a populated row, so there is no
+  layout jump when data lands.
+- *Non-terminal scenarios (running / failed / pending)* — match production's
+  existing live-table rendering. Hard rule: a *running* cell must read as
+  in-progress, visually distinct from a missing value — never a bare "—" for a
+  running scenario (the user must be able to tell "computed nothing" from
+  "still computing").
+
+**Filter states:**
+- *Scanning* — a live hit-ratio counter ("Scanned N / matched M", from
+  `hitRatioAtom`), not a silent spinner. A picky filter scans thousands of
+  scenarios to surface a few; the counter explains the wait and keeps trust.
+- *No match* — a real empty state: "No scenarios match this filter" + a
+  one-click **Clear filter** action. Not "No items found."
+- *Rate-limited / scan failed* — keep the partial viewport visible with a
+  non-blocking inline indicator ("Filtering paused — retrying…"). Never a
+  blocking overlay.
+
+**Filter bar** — lives in the eval run details header row, following the
+observability `Filters` placement. Multi-predicate AND/OR composition (D8) —
+reuses the observability multi-condition filter UI.
+
+## Test plan
+
+```
+PLANNED COMPONENT                                    TESTS
+T1 thin store          [DROPPED — D6] (was: page fetch → skeleton+merge; per-eval-type order)
+T2 schema columns      [GAP][CRITICAL][REGRESSION] resolveMappings column set
+                              == usePreviewColumns for auto/human/online,
+                              "other" columns INCLUDED — before deleting the old path
+T3 cells               [GAP] resolve from caches; pending/running/failed render
+                       [GAP] skeleton-while-pending: not-hydrated vs not-run
+T4 filtering           [GAP] filterSchema typed fields; multi-predicate
+                              AND/OR filterTransform — match/no-match/pending
+                              + group semantics; [→E2E] multi-condition → rows
+T5 comparison          [GAP] compare-run schema fetch; testcase_id join;
+                              common-evaluator intersection; [→E2E] compare+filter
+T6 live updates        [GAP] poll stops at terminal; page invalidation; gap-fill
+T8 co-consumers        [VERIFIED] focus drawer + scenario viewer render
+                              unchanged — independent old data path (D9)
+```
+
+Pure logic — `filterTransform`, `filterSchema` derivation, the comparison
+testcase-join — is unit-tested in the `@agenta/entities` vitest suite (the
+batch-add harness). The two `[→E2E]` flows go to Playwright. The T2 and T8
+**regression guards are mandatory** — they protect a user-visible column set
+and two live co-consumers.
+
+---
+
+## Edge cases & constraints
+
+- **Non-terminal scenarios** — pending / running / failed / partial rows must
+  render; the PoC never did. Distinguish slice-not-hydrated from
+  scenario-not-run.
+- **"Other" columns** — `useEtlColumns` must keep them (T2).
+- **Eval types** — auto / human / online all derive columns from the run
+  schema; online fetches `descending`.
+- **Filtered + comparison** — filtering the main run re-drives compare
+  alignment; a filtered-out main row drops its compare group.
+- **Filter scan throttling** — reuse `withRateLimitRetry` (T4).
+- **Large-run memory** — within a run, bulk-hydrate fills caches;
+  `useScopeChangeEviction` only evicts on run change. Verify at 10k scenarios;
+  per-chunk eviction exists if needed.
+- **Testset mismatch** — compared runs not sharing the main testset produce an
+  empty testcase join (the candidate filter already guards this).
+- **`EvalRunDetails` / `EvalRunDetails2` split** — do not deepen it; touched
+  code consolidates into `EvalRunDetails/`.
+
+---
+
+## NOT in scope
+
+- **Testcase-aligned comparison columns** — interleaved rows for now (D2).
+- **Real scenario streaming** — match production's poll bar (D3).
+- **Cross-run filter predicates** ("main high, run B regressed") — filter the
+  main run only; cross-run is a v2 feature.
+- **Consolidating `EvalRunDetails2`** — only the touched code moves.
+- **Backend filter param** — client-side filter is v1 (`eval-filtering.md`).
+
+## What already exists (reused, not rebuilt)
+
+- The ETL engine + generic primitives (`@agenta/entities/etl`) — built and
+  tested this session.
+- `evaluationPreviewTableStore` — already a thin store (identity + `status`,
+  no column data, per-eval-type order); kept as-is, no swap needed (T1 dropped).
+- `fetchEvaluationScenarioWindow` — the scenario fetch; reused unchanged.
+- `mergedRows` testcase_id-join alignment — ported, not reinvented.
+- `withRateLimitRetry` — reused for the filter scan.
+- The PoC's `useHydrateScenarios` / `useEtlColumns` / `EtlResolvedCell` /
+  `useCellMaterialization` — ported (with the corrections above).
+
+---
+
+## Implementation tasks
+
+**Phase 1 — column + cell swap** (T1 dropped — `evaluationPreviewTableStore` is already thin; see Phase 1)
+- [x] **T2 (P1)** — schema columns for the **rendered** table; keeps "other" columns; **column-parity regression test** (`groupRunColumns.test.ts`). `usePreviewColumns`/`columnResult` kept alive for the export path. Landed with T3.
+- [x] **T3 (P1)** — self-hydrating cells **plus non-terminal scenario rendering + skeleton-while-pending**. Landed with T2.
+- [ ] **Perf gate (P1)** — benchmark vs the old table, 1000+ scenarios, comparison on. **Gates T4.**
+
+**Phase 2 — filtering**
+- [x] **T4 (P1)** — multi-predicate AND/OR filtering (D8): `filterSchema` + `evaluateRowFilter` / `PredicateGroup` core (entities, unit-tested) + a popover filter bar in the run header + confirmed-match incremental rendering + viewport-fill loop. Column value types come from the evaluator output schema. v1 withholds testset/application columns behind a UI allowlist and `in`/`nin` operators from the UI.
+
+**Phase 3 — comparison, live, co-consumers**
+- [x] **T5 (P1)** — comparison: testcase-id join on the filtered base run; compare runs are eagerly paged while a filter is active so each matched base row finds its counterpart. Compare rows resolve against the base run's schema (best-effort, per the Phase-1 note).
+- [x] **T6 (P2)** — live updates (`useScenarioLiveUpdates`): while the run is non-terminal, periodically refetch the loaded scenario pages (row statuses) and evict + re-prefetch the results / metrics molecule caches of running / just-finished scenarios; one final pass at terminal, then stop.
+- [x] **T8 (P1)** — verified: the focus drawer + `SingleScenarioViewer` are **not regressed** by the cell swap — both run on the fully-preserved, independent old data path (`scenarioColumnValues.ts` + its dependency atoms), fetching their own values regardless of what the table renders. Full ETL migration is **deferred** (see D9): `useScenarioCellValue` cannot be deleted while the static invocation-metrics group (kept in the table, D7) and the CSV export both still depend on the old-path cells.
+
+**Cleanup**
+- [x] **T7** — `EtlPocScenarios/` + `/etl-poc` routes (oss + ee) deleted. Done ahead of the Phase-3 gate at the maintainer's direction: production has its own copies of the ported hooks, so the PoC was dead test-page code.
+
+**Open / advisory**
+- The **D5 perf gate** was not formally benchmarked — the table was QA'd functionally throughout Phase 1 + 2 instead.
+
+## GSTACK REVIEW REPORT
+
+| Review | Trigger | Why | Runs | Status | Findings |
+|--------|---------|-----|------|--------|----------|
+| CEO Review | `/plan-ceo-review` | Scope & strategy | 0 | — | — |
+| Codex Review | `/codex review` | Independent 2nd opinion | 0 | — | — |
+| Eng Review | `/plan-eng-review` | Architecture & tests (required) | 1 | clean | 5 decisions resolved, 0 critical gaps open |
+| Design Review | `/plan-design-review` | UI/UX gaps | 1 | clean | 5/10 → 9/10, 2 decisions, focused on states |
+| DX Review | `/plan-devex-review` | Developer experience gaps | 0 | — | — |
+
+- **OUTSIDE VOICE (eng):** Claude subagent — caught 5 real gaps the section review under-weighted: T1-T3 mislabeled as "low-risk mechanical port" (non-terminal rendering is unbuilt), `useEtlColumns` drops "other" columns (guaranteed regression), the perf premise was asserted not measured (→ D5 perf gate), T5 comparison is a build not a port (+ unlisted compare-schema fetch), the CSV export path was missed. All folded into the plan.
+- **ENG DECISIONS:** D1 phase the migration · D2 interleaved rows + common-evaluator intersection columns · D3 match production's live bar · D4 outside voice ran · D5 perf-validation gate after Phase 1.
+- **DESIGN DECISIONS:** focused review (migration preserves the visual design) · live hit-ratio counter for filter scanning · interaction-state specs added (skeleton, non-terminal cells, filter no-match empty state, rate-limited indicator).
+- **D6 (implementation-time finding):** starting Phase 1 confirmed `evaluationPreviewTableStore` is already a thin store (identity + status, no column data, per-eval-type order). **T1 is dropped** — Phase 1 is the coupled T2+T3 column+cell swap against the existing store. Confirms the eng-review outside voice's "T1 re-implements an existing store" point.
+- **D7 (implementation-time finding):** reading `Table.tsx` showed the CSV export path (`exportResolveValue`, `columnLookupMap`, `loadAllPagesBeforeExport`) is keyed off `columnResult` column ids, which differ from `useEtlColumns` keys. **Phase 1 swaps display columns only** and keeps `usePreviewColumns`/`columnResult` alive for export; the old column path fully retires in Phase 3 with the export migration (T5). The "other"-column un-drop ripples into `ColumnLeaf`, `EtlResolvedCell`, and `useCellMaterialization`.
+- **D8 (Phase 2 decision):** filter composition resolved — **multi-predicate AND/OR from day 1**, not the PoC's single predicate. The predicate type generalises to a flat condition group; the filter bar reuses the observability multi-condition UI.
+- **D9 (implementation-time finding):** **T8 co-consumers verified, full migration deferred.** Tracing `scenarioColumnValues.ts` and `SingleScenarioViewerPOC` confirmed the focus drawer and `SingleScenarioViewer` resolve their values through the old data path's independent atom families — they do not depend on the table's cells and so are **not regressed** by the T2+T3 cell swap. The design's "delete `useScenarioCellValue`" goal is blocked: that hook still backs `MetricCell`/`InputCell`/`InvocationCell`, which render the static invocation-metrics group kept in the production table (`metricGroupKeys`, D7) and feed the CSV export. A full ETL rebuild of the 1551-line `FocusDrawer` (incl. compare mode) is out of proportion to "no regression to fix" and would not enable the deletion. T8 closes as verified; the migration moves to the eventual old-column-path retirement.
+- **UNRESOLVED:** 0 — filter composition closed (D8). No open decisions.
+- **STATUS:** Phases 1–3 shipped — T2+T3 column/cell swap, T4 multi-predicate filtering, T5 comparison (testcase-id join), T6 live updates, T7 PoC retired, T8 co-consumers verified (no regression; full migration deferred per D9). Feature complete on `fe-experiment/etl-eval-scenario-filtering`.
+- **VERDICT:** ENG + DESIGN REVIEW CLEARED — all phases shipped.
diff --git a/web/oss/src/components/EvalRunDetails/Table.tsx b/web/oss/src/components/EvalRunDetails/Table.tsx
index bc5d6560e4..a5ef706696 100644
--- a/web/oss/src/components/EvalRunDetails/Table.tsx
+++ b/web/oss/src/components/EvalRunDetails/Table.tsx
@@ -1,8 +1,9 @@
-import {useCallback, useMemo, useRef} from "react"
+import {useCallback, useEffect, useMemo, useRef} from "react"
 
+import type {RunSchema} from "@agenta/entities/evaluationRun/etl"
 import {message} from "@agenta/ui/app-message"
 import clsx from "clsx"
-import {useAtomValue, useStore} from "jotai"
+import {useAtomValue, useSetAtom, useStore} from "jotai"
 
 import VirtualizedScenarioTableAnnotateDrawer from "@/oss/components/EvalRunDetails/components/AnnotateDrawer/VirtualizedScenarioTableAnnotateDrawer"
 import {
@@ -19,11 +20,20 @@ import {
 import useComparisonPaginations from "../EvalRunDetails2/hooks/useComparisonPaginations"
 
 import {MAX_COMPARISON_RUNS, compareRunIdsAtom, getComparisonColor} from "./atoms/compare"
+import {effectiveProjectIdAtom} from "./atoms/run"
 import {runDisplayNameAtomFamily} from "./atoms/runDerived"
 import type {EvaluationTableColumn} from "./atoms/table"
-import {DEFAULT_SCENARIO_PAGE_SIZE} from "./atoms/table"
+import {DEFAULT_SCENARIO_PAGE_SIZE, evaluationRunQueryAtomFamily} from "./atoms/table"
 import type {PreviewTableRow} from "./atoms/tableRows"
 import ScenarioColumnVisibilityPopoverContent from "./components/columnVisibility/ColumnVisibilityPopoverContent"
+import {CellMaterializerContext} from "./etl/cellMaterializerContext"
+import {scenarioFilterStatusAtomFamily} from "./etl/scenarioFilterState"
+import {useCellMaterialization} from "./etl/useCellMaterialization"
+import {useEtlColumns} from "./etl/useEtlColumns"
+import {useHydrateScenarios} from "./etl/useHydrateScenarios"
+import {useScenarioFilter} from "./etl/useScenarioFilter"
+import {useScenarioLiveUpdates} from "./etl/useScenarioLiveUpdates"
+import {useScopeChangeEviction} from "./etl/useScopeChangeEviction"
 import {
     evaluationPreviewDatasetStore,
     evaluationPreviewTableStore,
@@ -87,6 +97,156 @@ const EvalRunDetailsTable = ({
 
     const previewColumns = usePreviewColumns({columnResult, evaluationType})
 
+    // ── ETL schema columns + self-hydrating cells (Phase 1 — T2 + T3) ──
+    // The schema columns (testset / application / evaluator / metrics /
+    // other) are derived from the run graph and rendered by cells that
+    // resolve from molecule caches. `usePreviewColumns` / `columnResult`
+    // stay alive above for the CSV export path (keyed off `columnResult`
+    // ids) — only the *display* columns swap here.
+    const effectiveProjectId = useAtomValue(effectiveProjectIdAtom)
+    const projectId = _projectId ?? effectiveProjectId
+
+    const runQuery = useAtomValue(useMemo(() => evaluationRunQueryAtomFamily(runId), [runId]))
+    // Run-level status — drives the live-update loop below. While the run
+    // is non-terminal, completed scenarios must refresh; once it is
+    // terminal, the loop stops.
+    const runStatus = useMemo<string | null>(
+        () => runQuery.data?.rawRun?.status ?? runQuery.data?.camelRun?.status ?? null,
+        [runQuery.data],
+    )
+    const runSchema = useMemo<RunSchema | null>(() => {
+        const data = runQuery.data?.rawRun?.data
+        const steps = data?.steps
+        const mappings = data?.mappings
+        if (!Array.isArray(steps) || !Array.isArray(mappings)) return null
+        return {steps, mappings}
+    }, [runQuery.data])
+
+    // Phase 2 — multi-predicate filtering (D8). Filters the base run's
+    // rows; each comparison group follows its base row.
+    const {filteredBaseRows, effectiveFilter, active, confirmedMatchCount, isFilling} =
+        useScenarioFilter({
+            projectId,
+            runId,
+            schema: runSchema,
+            baseRows: basePagination.rows,
+            loadNextPage: basePagination.loadNextPage,
+            hasMore: basePagination.paginationInfo.hasMore,
+            isFetching: basePagination.paginationInfo.isFetching,
+        })
+
+    // Comparison + filter: the viewport-fill loop scans base pages, but a
+    // strict filter means the table may never scroll — so compare runs
+    // would stay on page 1 and the testcase-id join in `mergedRows` would
+    // miss most counterparts. While a filter is active, eagerly load every
+    // compare-run page so each matched base row can find its counterpart.
+    useEffect(() => {
+        if (!active) return
+        comparePaginations.forEach((pagination, idx) => {
+            if (!compareSlots[idx]) return
+            const {hasMore, isFetching} = pagination.paginationInfo
+            if (hasMore && !isFetching) pagination.loadNextPage()
+        })
+    }, [active, comparePaginations, compareSlots])
+
+    const etlColumns = useEtlColumns({projectId, runId, schema: runSchema})
+
+    // Page-level hydrate — predicate-aware: with an active filter it
+    // fetches the entity slices the filter needs to be evaluated; with no
+    // filter it is inert and cells materialize their own visible data.
+    const hydration = useHydrateScenarios({
+        projectId,
+        runId,
+        rows: basePagination.rows,
+        schema: runSchema,
+        predicate: effectiveFilter,
+        sliceMode: "auto",
+    })
+
+    // The filter scan is actively working — the viewport-fill loop still
+    // wants more pages (`isFilling`), a page is being fetched, or a
+    // hydrate batch is in flight. Once enough matches are found the loop
+    // stops, so this goes false even though the dataset has more pages
+    // (`hasMore`) — it only shows "scanning" while real work is happening.
+    const scanInProgress =
+        isFilling || (active && (basePagination.paginationInfo.isFetching || hydration.isHydrating))
+    // Nothing has confirmed-matched yet — show the full empty + loading
+    // overlay. Once the first match lands, rows show and grow (no overlay,
+    // no flicker — `filteredBaseRows` only ever grows during a scan).
+    const isScanning = scanInProgress && confirmedMatchCount === 0
+
+    // Publish the scan status so the filter bar — which lives in the run
+    // header, a separate part of the tree — can show the match count.
+    const setFilterStatus = useSetAtom(
+        useMemo(() => scenarioFilterStatusAtomFamily(runId), [runId]),
+    )
+    useEffect(() => {
+        setFilterStatus({matchCount: confirmedMatchCount, scanning: scanInProgress})
+    }, [setFilterStatus, confirmedMatchCount, scanInProgress])
+
+    // Cell-side lazy materializer — coalesces visible cells' slice
+    // requests into one bulk fetch per (slice, run).
+    const cellMaterializer = useCellMaterialization({projectId, runId})
+
+    // Evict molecule caches written for the outgoing run on scope change.
+    useScopeChangeEviction({projectId, runId})
+
+    // Live updates (T6) — while the run is executing, periodically refetch
+    // the loaded scenario pages (row statuses) and refresh the molecule
+    // caches of running / just-finished scenarios so completed cells leave
+    // the "Running" state. Inert once the run is terminal.
+    useScenarioLiveUpdates({projectId, runId, runStatus, pageSize})
+
+    // Production metric-group ids. The scenario table's "Metrics" group is
+    // the static invocation metrics (cost / duration / tokens) — injected
+    // by the backend-metadata path, not run-mapping-derived — so it is kept
+    // as-is rather than replaced by ETL columns.
+    const metricGroupKeys = useMemo(
+        () =>
+            new Set(
+                (columnResult?.groups ?? [])
+                    .filter((g) => g.kind === "metric")
+                    .map((g) => String(g.id)),
+            ),
+        [columnResult?.groups],
+    )
+
+    // Final rendered column set: production meta columns (index / status,
+    // timestamp, action), the column-visibility trigger, and the static
+    // metric group(s) are kept; the testset / application / evaluator /
+    // other schema groups are replaced by the ETL-derived ones. While the
+    // run schema is still loading, the production columns are used whole
+    // (their skeleton groups cover the gap).
+    const tableColumns = useMemo(() => {
+        const src = previewColumns.columns
+        if (!runSchema || etlColumns.length === 0) return src
+        const out: typeof src = []
+        let inserted = false
+        for (const col of src) {
+            const children = (col as {children?: unknown[]}).children
+            const isGroup = Array.isArray(children) && children.length > 0
+            const key = String((col as {key?: unknown}).key ?? "")
+            const isMetricGroup = isGroup && metricGroupKeys.has(key)
+            if (isGroup && !isMetricGroup) {
+                if (!inserted) {
+                    out.push(...(etlColumns as typeof src))
+                    inserted = true
+                }
+                // drop the production schema group column (replaced by ETL)
+            } else {
+                // keep: meta / visibility columns AND static metric groups
+                out.push(col)
+            }
+        }
+        if (!inserted) {
+            // No replaceable production group columns — insert ETL groups
+            // before the trailing column-visibility trigger.
+            const at = Math.max(out.length - 1, 0)
+            out.splice(at, 0, ...(etlColumns as typeof src))
+        }
+        return out
+    }, [previewColumns.columns, etlColumns, runSchema, metricGroupKeys])
+
     // Inject synthetic columns for comparison exports (do not render in UI)
     const exportColumns = useMemo(() => {
         const hasCompareRuns = compareSlots.some(Boolean)
@@ -125,7 +285,7 @@ const EvalRunDetailsTable = ({
 
     const mergedRows = useMemo(() => {
         if (!compareSlots.some(Boolean)) {
-            return basePagination.rows.map((row) => ({
+            return filteredBaseRows.map((row) => ({
                 ...row,
                 baseScenarioId: row.scenarioId ?? row.id,
                 compareIndex: 0,
@@ -133,7 +293,7 @@ const EvalRunDetailsTable = ({
             }))
         }
 
-        const baseRows = basePagination.rows.map((row) => ({
+        const baseRows = filteredBaseRows.map((row) => ({
             ...row,
             baseScenarioId: row.scenarioId ?? row.id,
             compareIndex: 0,
@@ -215,7 +375,7 @@ const EvalRunDetailsTable = ({
         })
 
         return result
-    }, [basePagination.rows, compareSlots, compareRowsBySlot])
+    }, [filteredBaseRows, compareSlots, compareRowsBySlot])
 
     const handleRowClick = useCallback(
         (record: TableRowData) => {
@@ -276,6 +436,9 @@ const EvalRunDetailsTable = ({
 
     const paginationForShell = useMemo<TableFeaturePagination<TableRowData>>(
         () => ({
+            // `mergedRows` is monotonic during a scan (confirmed matches
+            // only), so it can be handed to the table directly — no empty
+            // placeholder swap needed.
             rows: mergedRows,
             loadNextPage: handleLoadMore,
             resetPages: handleResetPages,
@@ -828,74 +991,94 @@ const EvalRunDetailsTable = ({
     const hasCompareRuns = compareSlots.some(Boolean)
 
     return (
-        <section className="bg-zinc-1 w-full h-full overflow-hidden flex flex-col px-2">
-            <div className="w-full grow min-h-0 overflow-auto">
-                <InfiniteVirtualTableFeatureShell<TableRowData>
-                    datasetStore={evaluationPreviewDatasetStore}
-                    tableScope={tableScope}
-                    store={store}
-                    columns={previewColumns.columns}
-                    rowKey={(record) => record.key}
-                    tableClassName={clsx(
-                        "agenta-scenario-table",
-                        `agenta-scenario-table--row-${rowHeight}`,
-                    )}
-                    resizableColumns
-                    useSettingsDropdown
-                    settingsDropdownMenuItems={rowHeightMenuItems}
-                    columnVisibilityMenuRenderer={(
-                        controls,
-                        close,
-                        {scopeId, onExport, isExporting},
-                    ) => (
-                        <ScenarioColumnVisibilityPopoverContent
-                            controls={controls}
-                            onClose={close}
-                            scopeId={scopeId}
-                            runId={runId}
-                            evaluationType={evaluationType}
-                            onExport={onExport}
-                            isExporting={isExporting}
-                        />
-                    )}
-                    pagination={paginationForShell}
-                    exportOptions={exportOptions}
-                    tableProps={{
-                        rowClassName: (record) =>
-                            clsx("scenario-row", {
-                                "scenario-row--comparison": record.isComparisonRow,
-                            }),
-                        size: "small",
-                        sticky: true,
-                        virtual: true,
-                        bordered: true,
-                        tableLayout: "fixed",
-                        onRow: (record) => {
-                            const backgroundColor = hasCompareRuns
-                                ? getComparisonColor(
-                                      typeof record.compareIndex === "number"
-                                          ? record.compareIndex
-                                          : 0,
-                                  )
-                                : "#fff"
-
-                            return {
-                                onClick: (event) => {
-                                    const target = event.target as HTMLElement | null
-                                    if (target?.closest("[data-ivt-stop-row-click]")) return
-                                    handleRowClick(record as TableRowData)
-                                },
-                                className: clsx({
-                                    "comparison-row": record.isComparisonRow,
+        <CellMaterializerContext.Provider value={cellMaterializer}>
+            <section className="bg-zinc-1 w-full h-full overflow-hidden flex flex-col px-2">
+                <div className="w-full grow min-h-0 overflow-auto">
+                    <InfiniteVirtualTableFeatureShell<TableRowData>
+                        /*
+                         * Remount on filter change. Applying a filter
+                         * shrinks the row set sharply; remounting resets
+                         * the virtual render window to the new length so
+                         * its row renderer can't index past the end.
+                         * Column visibility survives (localStorage-backed).
+                         */
+                        key={`scenario-table-${runId}-${JSON.stringify(effectiveFilter)}`}
+                        datasetStore={evaluationPreviewDatasetStore}
+                        tableScope={tableScope}
+                        store={store}
+                        columns={tableColumns}
+                        /*
+                         * Defensive rowKey — antd's virtual table can hand
+                         * this an out-of-range `undefined` record for a
+                         * frame while the filtered row set is shrinking.
+                         */
+                        rowKey={(record, index) => record?.key ?? `__phantom_${index ?? 0}`}
+                        tableClassName={clsx(
+                            "agenta-scenario-table",
+                            `agenta-scenario-table--row-${rowHeight}`,
+                        )}
+                        resizableColumns
+                        useSettingsDropdown
+                        settingsDropdownMenuItems={rowHeightMenuItems}
+                        columnVisibilityMenuRenderer={(
+                            controls,
+                            close,
+                            {scopeId, onExport, isExporting},
+                        ) => (
+                            <ScenarioColumnVisibilityPopoverContent
+                                controls={controls}
+                                onClose={close}
+                                scopeId={scopeId}
+                                runId={runId}
+                                evaluationType={evaluationType}
+                                onExport={onExport}
+                                isExporting={isExporting}
+                            />
+                        )}
+                        pagination={paginationForShell}
+                        exportOptions={exportOptions}
+                        tableProps={{
+                            rowClassName: (record) =>
+                                clsx("scenario-row", {
+                                    "scenario-row--comparison": record.isComparisonRow,
                                 }),
-                                style: backgroundColor ? {backgroundColor} : undefined,
-                            }
-                        },
-                    }}
-                />
-            </div>
-            <VirtualizedScenarioTableAnnotateDrawer runId={runId} />
-        </section>
+                            size: "small",
+                            sticky: true,
+                            virtual: true,
+                            bordered: true,
+                            tableLayout: "fixed",
+                            // One steady overlay while a filter scans for
+                            // its first match — replaces per-page flicker.
+                            loading: isScanning
+                                ? {spinning: true, tip: "Scanning for matches…"}
+                                : false,
+                            onRow: (record) => {
+                                const backgroundColor = hasCompareRuns
+                                    ? getComparisonColor(
+                                          typeof record.compareIndex === "number"
+                                              ? record.compareIndex
+                                              : 0,
+                                      )
+                                    : "#fff"
+
+                                return {
+                                    onClick: (event) => {
+                                        const target = event.target as HTMLElement | null
+                                        if (target?.closest("[data-ivt-stop-row-click]")) return
+                                        handleRowClick(record as TableRowData)
+                                    },
+                                    className: clsx({
+                                        "comparison-row": record.isComparisonRow,
+                                    }),
+                                    style: backgroundColor ? {backgroundColor} : undefined,
+                                }
+                            },
+                        }}
+                    />
+                </div>
+                <VirtualizedScenarioTableAnnotateDrawer runId={runId} />
+            </section>
+        </CellMaterializerContext.Provider>
     )
 }
 
diff --git a/web/oss/src/components/EvalRunDetails/atoms/metrics.ts b/web/oss/src/components/EvalRunDetails/atoms/metrics.ts
index 4c98c6d5e8..cce38b7c9d 100644
--- a/web/oss/src/components/EvalRunDetails/atoms/metrics.ts
+++ b/web/oss/src/components/EvalRunDetails/atoms/metrics.ts
@@ -13,6 +13,7 @@ import {getProjectValues} from "@/oss/state/project"
 import {previewEvalTypeAtom} from "../state/evalType"
 import {resolveValueBySegments, splitPath} from "../utils/valueAccess"
 
+import {isTerminalStatus} from "./compare"
 import {
     createMetricProcessor,
     isLegacyValueLeaf,
@@ -782,6 +783,15 @@ export const evaluationMetricQueryAtomFamily = atomFamily(
             const batcher = get(evaluationMetricBatcherFamily({runId: effectiveRunId}))
             const projectId = resolveProjectId(get)
 
+            // While the run is still executing, poll so a completing
+            // scenario's metrics surface in the table cells + focus drawer
+            // without a manual reload. Stops once the run is terminal.
+            const runQuery = effectiveRunId
+                ? get(evaluationRunQueryAtomFamily(effectiveRunId))
+                : undefined
+            const runStatus = runQuery?.data?.rawRun?.status ?? runQuery?.data?.camelRun?.status
+            const runTerminal = isTerminalStatus(runStatus)
+
             return {
                 queryKey: ["preview", "evaluation-metric", effectiveRunId, projectId, scenarioId],
                 enabled: Boolean(projectId && effectiveRunId && batcher && scenarioId),
@@ -789,6 +799,7 @@ export const evaluationMetricQueryAtomFamily = atomFamily(
                 gcTime: 5 * 60 * 1000,
                 refetchOnWindowFocus: false,
                 refetchOnReconnect: false,
+                refetchInterval: runTerminal ? false : 5000,
                 // Enable structural sharing to prevent unnecessary re-renders when data hasn't changed
                 structuralSharing: true,
                 queryFn: async () => {
diff --git a/web/oss/src/components/EvalRunDetails/atoms/scenarioSteps.ts b/web/oss/src/components/EvalRunDetails/atoms/scenarioSteps.ts
index 3cf86844fc..aca7afb8b2 100644
--- a/web/oss/src/components/EvalRunDetails/atoms/scenarioSteps.ts
+++ b/web/oss/src/components/EvalRunDetails/atoms/scenarioSteps.ts
@@ -8,7 +8,9 @@ import type {IStepResponse} from "@/oss/lib/evaluations"
 import {snakeToCamelCaseKeys} from "@/oss/lib/helpers/casing"
 import {getProjectValues} from "@/oss/state/project"
 
+import {isTerminalStatus} from "./compare"
 import {activePreviewRunIdAtom, effectiveProjectIdAtom} from "./run"
+import {evaluationRunQueryAtomFamily} from "./table/run"
 import type {ScenarioStepsBatchResult} from "./types"
 
 const scenarioStepsBatcherCache = new Map<string, BatchFetcher<string, ScenarioStepsBatchResult>>()
@@ -128,11 +130,21 @@ export const scenarioStepsQueryFamily = atomFamily(
             const effectiveRunId = resolveEffectiveRunId(get, runId)
             const batcher = get(scenarioStepsBatcherFamily({runId: effectiveRunId}))
 
+            // While the run is still executing, poll so the focus drawer /
+            // scenario viewer pick up a scenario's results as it completes.
+            // Stops once the run is terminal.
+            const runQuery = effectiveRunId
+                ? get(evaluationRunQueryAtomFamily(effectiveRunId))
+                : undefined
+            const runStatus = runQuery?.data?.rawRun?.status ?? runQuery?.data?.camelRun?.status
+            const runTerminal = isTerminalStatus(runStatus)
+
             return {
                 queryKey: ["preview", "scenario-steps", effectiveRunId, scenarioId],
                 enabled: Boolean(effectiveRunId && batcher && scenarioId),
                 refetchOnWindowFocus: false,
                 refetchOnReconnect: false,
+                refetchInterval: runTerminal ? false : 5000,
                 staleTime: 30_000,
                 gcTime: 5 * 60 * 1000,
                 // Enable structural sharing to prevent unnecessary re-renders when data hasn't changed
diff --git a/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts b/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts
index 824814ffe4..fd04f1fc8e 100644
--- a/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts
+++ b/web/oss/src/components/EvalRunDetails/atoms/table/columns.ts
@@ -499,9 +499,19 @@ const tableColumnsBaseAtomFamily = atomFamily((runId: string | null) =>
             }
 
             const evaluator = column.evaluatorId ? evaluatorById.get(column.evaluatorId) : undefined
+            // Match the evaluator's metric definition by the canonical
+            // metric key (e.g. "attributes.ag.data.outputs.score") OR the
+            // bare value key (e.g. "score"). `extractMetrics` keys metrics
+            // by the output-schema property name — the bare key — so a
+            // canonical-key-only match misses and `metricType` falls back
+            // to "string", mis-typing the column (e.g. a boolean output).
             const metricKey = column.metricKey || column.valueKey
             const metricDefinition = evaluator?.metrics.find(
-                (metric) => metric.name === metricKey || metric.path === metricKey,
+                (metric) =>
+                    metric.name === metricKey ||
+                    metric.path === metricKey ||
+                    metric.name === column.valueKey ||
+                    metric.path === column.valueKey,
             )
             const metricType =
                 metricDefinition?.metricType || column.metricType || METRIC_TYPE_FALLBACK
diff --git a/web/oss/src/components/EvalRunDetails/components/Page.tsx b/web/oss/src/components/EvalRunDetails/components/Page.tsx
index ebad5df214..7369ee14cc 100644
--- a/web/oss/src/components/EvalRunDetails/components/Page.tsx
+++ b/web/oss/src/components/EvalRunDetails/components/Page.tsx
@@ -140,7 +140,7 @@ const EvalRunPreviewPage = ({runId, evaluationType, projectId = null}: EvalRunPr
             headerClassName="px-4 pt-2"
         >
             <div className="flex h-full min-h-0 flex-col gap-2 [&_.ant-tabs-content]:h-full [&_.ant-tabs-tabpane]:h-full">
-                <PreviewEvalRunMeta runId={runId} projectId={projectId} />
+                <PreviewEvalRunMeta runId={runId} projectId={projectId} activeView={activeView} />
                 <Tabs
                     className="flex-1 min-h-0 overflow-hidden"
                     activeKey={activeView}
diff --git a/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx b/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx
index 1653eb8f84..965029bb36 100644
--- a/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx
+++ b/web/oss/src/components/EvalRunDetails/components/PreviewEvalRunHeader.tsx
@@ -17,6 +17,7 @@ import {
     runTestsetIdsAtomFamily,
     runFlagsAtomFamily,
 } from "../atoms/runDerived"
+import ScenarioFilterBar from "../etl/ScenarioFilterBar"
 import {previewEvalTypeAtom} from "../state/evalType"
 
 import CompareRunsMenu from "./CompareRunsMenu"
@@ -137,10 +138,12 @@ const PreviewEvalRunMeta = ({
     runId,
     projectId,
     className,
+    activeView,
 }: {
     runId: string
     projectId?: string | null
     className?: string
+    activeView?: ActiveView
 }) => {
     const _invocationRefs = useAtomValue(useMemo(() => runInvocationRefsAtomFamily(runId), [runId]))
     const _testsetIds = useAtomValue(useMemo(() => runTestsetIdsAtomFamily(runId), [runId]))
@@ -220,6 +223,7 @@ const PreviewEvalRunMeta = ({
                         </Button>
                     </Tooltip>
                 ) : null}
+                {activeView === "scenarios" ? <ScenarioFilterBar runId={runId} /> : null}
                 <CompareRunsMenu runId={runId} />
             </div>
         </div>
diff --git a/web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx b/web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx
new file mode 100644
index 0000000000..d69faa29e0
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx
@@ -0,0 +1,74 @@
+/**
+ * EtlColumnHeader
+ *
+ * Renders the nested-header label for a column group. The default
+ * `computeColumnGroup` resolver falls back to `Testset <slug>` /
+ * `Application <slug>` because it doesn't fetch the entity itself.
+ *
+ * This header overrides that fallback, using the same pattern as
+ * production's `StepGroupHeader`: subscribe to the entity reference atom
+ * by ID, show the entity's name when it is available, and otherwise keep
+ * the slug-based label. Evaluator, metrics, and other groups already
+ * carry `slugToTitle`-rendered labels, so they need no entity lookup.
+ */
+
+import {useMemo} from "react"
+
+import type {ColumnGroup} from "@agenta/entities/evaluationRun/etl"
+import {Tooltip} from "antd"
+import {atom, useAtomValue} from "jotai"
+
+import {
+    applicationReferenceQueryAtomFamily,
+    testsetReferenceQueryAtomFamily,
+} from "../atoms/references"
+
+const emptyAtom = atom<{data: {name?: string; slug?: string} | null} | null>(null)
+
+interface EtlColumnHeaderProps {
+    group: ColumnGroup
+}
+
+const pickName = (entity: unknown): string | null => {
+    if (typeof entity !== "object" || entity === null) return null
+    const {name} = entity as {name?: unknown}
+    return typeof name === "string" && name !== "" ? name : null
+}
+
+const EtlColumnHeader = ({group}: EtlColumnHeaderProps) => {
+    // Groups without a testset/application id subscribe to the null-valued atom.
+    const entityAtom = useMemo(() => {
+        switch (group.kind) {
+            case "testset": {
+                const testsetId = (group.refs?.testset as {id?: string} | undefined)?.id
+                return testsetId ? testsetReferenceQueryAtomFamily(testsetId) : emptyAtom
+            }
+            case "application": {
+                const appId = (group.refs?.application as {id?: string} | undefined)?.id
+                return appId ? applicationReferenceQueryAtomFamily(appId) : emptyAtom
+            }
+            default:
+                return emptyAtom
+        }
+    }, [group])
+    const entityQuery = useAtomValue(entityAtom) as {data?: unknown} | null
+    const name = pickName(entityQuery?.data ?? null)
+
+    const label = useMemo(() => {
+        // With no resolved entity name, every kind shows the group's own label.
+        if (!name) return group.label
+        if (group.kind === "testset") return `Testset ${name}`
+        if (group.kind === "application") return `Application ${name}`
+        return group.label
+    }, [group.kind, group.label, name])
+
+    return (
+        <Tooltip title={label} placement="top">
+            <span className="block max-w-full overflow-hidden text-ellipsis whitespace-nowrap text-left">
+                {label}
+            </span>
+        </Tooltip>
+    )
+}
+
+export default EtlColumnHeader
diff --git a/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx
new file mode 100644
index 0000000000..bed7a7d6a7
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/ScenarioFilterBar.tsx
@@ -0,0 +1,412 @@
+/**
+ * ScenarioFilterBar — multi-condition AND/OR filter for the evaluation
+ * run scenarios table (decision D8).
+ *
+ * Self-contained: given only a `runId` it derives the run schema, the
+ * column value types, and the live scan status from atoms — so it can be
+ * dropped into the run header rather than sitting above the table.
+ *
+ * Follows the observability `Filters` pattern: a compact "Filters" button
+ * opens a popover holding the condition rows. Edits are staged in a draft
+ * and committed on "Apply".
+ */
+
+import {useMemo, useState} from "react"
+
+import {
+    buildFilterSchema,
+    type ColumnGroup,
+    type FilterOperator,
+    type FilterValueType,
+    type PredicateGroup,
+    type RowPredicate,
+    type RunSchema,
+} from "@agenta/entities/evaluationRun/etl"
+import {Button, Divider, Input, InputNumber, Popover, Select, Tooltip} from "antd"
+import {useAtom, useAtomValue} from "jotai"
+import {Filter as FilterIcon, Loader2, Plus, X} from "lucide-react"
+
+import {evaluationRunQueryAtomFamily, tableColumnsAtomFamily} from "../atoms/table"
+
+import {buildColumnValueTypeResolver} from "./columnValueTypes"
+import {
+    scenarioFilterAtomFamily,
+    isConditionComplete,
+    scenarioFilterStatusAtomFamily,
+} from "./scenarioFilterState"
+
+const OP_LABELS: Record<FilterOperator, string> = {
+    eq: "equals",
+    ne: "not equals",
+    lt: "<",
+    lte: "≤",
+    gt: ">",
+    gte: "≥",
+    in: "in",
+    nin: "not in",
+}
+
+// Operators offered in the UI, in the order the operator select lists them.
+// `in` / `nin` take a list of values (a tag input); the rest take one value.
+const UI_OPERATORS: FilterOperator[] = ["eq", "ne", "lt", "lte", "gt", "gte", "in", "nin"]
+
+/** Whether `op` expects a list of values (`in` / `nin`) rather than a single scalar. */
+const isListOperator = (op: FilterOperator) => ["in", "nin"].includes(op)
+
+/**
+ * v1 column-kind allowlist for filtering. Only metric-related columns
+ * (evaluator outputs + metrics) are offered for now; testset (input) and
+ * application (output) columns are deliberately withheld.
+ *
+ * This is a UI allowlist only — the filter engine supports every kind.
+ * Flip a kind to `true` here to enable it; no other change is needed.
+ */
+const FILTERABLE_COLUMN_KINDS: Record<ColumnGroup["kind"], boolean> = {
+    evaluator: true,
+    metrics: true,
+    testset: false,
+    application: false,
+    other: false,
+}
+
+const EMPTY_FILTER: PredicateGroup = {op: "and", conditions: []}
+
+const encodeField = (f: {groupKind: string; groupSlug?: string | null; columnName: string}) =>
+    [f.groupKind, f.groupSlug ?? "", f.columnName].join("|")
+
+const blankCondition = (): RowPredicate => ({
+    groupKind: "evaluator",
+    groupSlug: null,
+    columnName: "",
+    op: "eq",
+    value: "",
+})
+
+/** Render antd Select dropdowns inside the popover so using them doesn't close it. */
+const getWithinPopover = (trigger: HTMLElement) =>
+    trigger.closest<HTMLElement>(".ant-popover") ?? document.body
+
+export interface ScenarioFilterBarProps {
+    runId: string
+}
+
+const ScenarioFilterBar = ({runId}: ScenarioFilterBarProps) => {
+    const [applied, setApplied] = useAtom(scenarioFilterAtomFamily(runId))
+    const {matchCount, scanning} = useAtomValue(scenarioFilterStatusAtomFamily(runId))
+    const [open, setOpen] = useState(false)
+    // Draft conditions edited inside the popover; committed on Apply.
+    const [draft, setDraft] = useState<PredicateGroup>(applied)
+
+    // Run schema (steps + mappings) — drives the filterable columns.
+    const runQuery = useAtomValue(useMemo(() => evaluationRunQueryAtomFamily(runId), [runId]))
+    const schema = useMemo<RunSchema | null>(() => {
+        const data = runQuery.data?.rawRun?.data
+        const steps = data?.steps
+        const mappings = data?.mappings
+        if (!Array.isArray(steps) || !Array.isArray(mappings)) return null
+        return {steps, mappings}
+    }, [runQuery.data])
+
+    // Column value types — sourced from the evaluator output schemas.
+    const columnResult = useAtomValue(useMemo(() => tableColumnsAtomFamily(runId), [runId]))
+    const resolveValueType = useMemo(
+        () => buildColumnValueTypeResolver(columnResult),
+        [columnResult],
+    )
+
+    const fields = useMemo(
+        () =>
+            buildFilterSchema(schema, {resolveValueType}).fields.filter(
+                (f) => FILTERABLE_COLUMN_KINDS[f.groupKind],
+            ),
+        [schema, resolveValueType],
+    )
+    const fieldByKey = useMemo(() => new Map(fields.map((f) => [encodeField(f), f])), [fields])
+    const fieldOptions = useMemo(
+        () =>
+            fields.map((f) => ({
+                value: encodeField(f),
+                label: `${f.groupLabel} · ${f.label}`,
+            })),
+        [fields],
+    )
+
+    // Run graph carries no filterable columns — hide the bar entirely.
+    if (fields.length === 0) return null
+
+    const appliedCount = applied.conditions.filter(isConditionComplete).length
+    const conditions = draft.conditions
+
+    const setConditions = (next: RowPredicate[]) => setDraft((d) => ({...d, conditions: next}))
+    const updateCondition = (index: number, partial: Partial<RowPredicate>) =>
+        setConditions(conditions.map((c, i) => (i === index ? {...c, ...partial} : c)))
+    const removeCondition = (index: number) =>
+        setConditions(conditions.filter((_, i) => i !== index))
+
+    const handleOpenChange = (next: boolean) => {
+        if (next) {
+            // Seed the draft from the applied filter (one blank row when empty).
+            setDraft(
+                applied.conditions.length > 0
+                    ? applied
+                    : {op: "and", conditions: [blankCondition()]},
+            )
+        }
+        setOpen(next)
+    }
+
+    const apply = () => {
+        setApplied({op: draft.op, conditions: draft.conditions.filter(isConditionComplete)})
+        setOpen(false)
+    }
+    const clearAll = () => {
+        setApplied(EMPTY_FILTER)
+        setDraft(EMPTY_FILTER)
+        setOpen(false)
+    }
+
+    const popoverContent = (
+        <div className="flex w-[560px] max-w-[calc(100vw-32px)] flex-col text-xs">
+            <div className="flex items-center justify-between gap-3 px-1 pb-2">
+                <span className="font-medium text-zinc-600">Filter scenarios</span>
+                {appliedCount > 0 ? (
+                    <span className="inline-flex items-center gap-1 font-normal text-zinc-500">
+                        {scanning ? <Loader2 size={12} className="animate-spin" /> : null}
+                        <span>
+                            {matchCount} {matchCount === 1 ? "match" : "matches"}
+                            {scanning ? " · scanning…" : ""}
+                        </span>
+                    </span>
+                ) : null}
+            </div>
+            <Divider className="!my-0 !mb-2" />
+
+            <div className="flex flex-col gap-1.5">
+                {conditions.map((condition, index) => {
+                    const fieldKey = condition.columnName ? encodeField(condition) : undefined
+                    const field = fieldKey ? fieldByKey.get(fieldKey) : undefined
+                    const valueType: FilterValueType = field?.valueType ?? "unknown"
+                    const ops = field
+                        ? UI_OPERATORS.filter((o) => field.operators.includes(o))
+                        : UI_OPERATORS
+
+                    return (
+                        <div key={index} className="flex items-center gap-1.5">
+                            {/*
+                             * Row-level AND/OR connector in a fixed-width
+                             * slot — so the Column select after it lines up
+                             * across every row regardless of whether the
+                             * connector is the "Where" label or the select.
+                             * The group has a single op (flat group — D8),
+                             * so every connector shows and toggles the same
+                             * value.
+                             */}
+                            <div className="flex w-20 shrink-0 items-center text-zinc-400">
+                                {index === 0 ? (
+                                    <span className="pl-2">Where</span>
+                                ) : (
+                                    <Select<"and" | "or">
+                                        variant="borderless"
+                                        className="w-full"
+                                        value={draft.op}
+                                        options={[
+                                            {label: "And", value: "and"},
+                                            {label: "Or", value: "or"},
+                                        ]}
+                                        getPopupContainer={getWithinPopover}
+                                        onChange={(op) => setDraft((d) => ({...d, op}))}
+                                    />
+                                )}
+                            </div>
+                            <Select<string>
+                                placeholder="Column"
+                                className="w-[200px] shrink-0"
+                                showSearch
+                                optionFilterProp="label"
+                                value={fieldKey}
+                                options={fieldOptions}
+                                getPopupContainer={getWithinPopover}
+                                onChange={(value) => {
+                                    const picked = fieldByKey.get(value)
+                                    if (!picked) return
+                                    const nextOps = UI_OPERATORS.filter((o) =>
+                                        picked.operators.includes(o),
+                                    )
+                                    updateCondition(index, {
+                                        groupKind: picked.groupKind,
+                                        groupSlug: picked.groupSlug,
+                                        columnName: picked.columnName,
+                                        op: nextOps[0] ?? "eq",
+                                        value: picked.valueType === "boolean" ? true : "",
+                                    })
+                                }}
+                            />
+                            <Select<FilterOperator>
+                                className="w-[110px] shrink-0"
+                                value={condition.op}
+                                disabled={!field}
+                                options={ops.map((o) => ({value: o, label: OP_LABELS[o]}))}
+                                getPopupContainer={getWithinPopover}
+                                onChange={(op) => {
+                                    // Switching between scalar and list
+                                    // operators changes the value shape —
+                                    // reset it so it stays valid.
+                                    const isList = isListOperator(op)
+                                    const wasList = Array.isArray(condition.value)
+                                    const value =
+                                        isList === wasList ? condition.value : isList ? [] : ""
+                                    updateCondition(index, {op, value})
+                                }}
+                            />
+                            <ConditionValueInput
+                                op={condition.op}
+                                valueType={valueType}
+                                value={condition.value}
+                                disabled={!field}
+                                onChange={(value) => updateCondition(index, {value})}
+                            />
+                            <Tooltip title="Remove">
+                                <Button
+                                    size="small"
+                                    type="text"
+                                    icon={<X size={14} />}
+                                    onClick={() => removeCondition(index)}
+                                />
+                            </Tooltip>
+                        </div>
+                    )
+                })}
+            </div>
+
+            <Button
+                size="small"
+                type="dashed"
+                icon={<Plus size={14} />}
+                className="mt-2 self-start"
+                onClick={() => setConditions([...conditions, blankCondition()])}
+            >
+                Add condition
+            </Button>
+
+            <Divider className="!my-2" />
+            <div className="flex items-center justify-between px-1">
+                <Button size="small" onClick={clearAll} disabled={appliedCount === 0}>
+                    Clear
+                </Button>
+                <div className="flex items-center gap-2">
+                    <Button size="small" onClick={() => setOpen(false)}>
+                        Cancel
+                    </Button>
+                    <Button size="small" type="primary" onClick={apply}>
+                        Apply
+                    </Button>
+                </div>
+            </div>
+        </div>
+    )
+
+    return (
+        <Popover
+            open={open}
+            onOpenChange={handleOpenChange}
+            trigger="click"
+            placement="bottomRight"
+            arrow={false}
+            content={popoverContent}
+        >
+            <Button
+                icon={<FilterIcon size={14} />}
+                aria-label="Filter scenarios"
+                className="inline-flex items-center gap-1"
+            >
+                <span
+                    className={`rounded-full px-1.5 text-[10px] font-medium ${
+                        appliedCount > 0 ? "bg-zinc-700 text-white" : "bg-zinc-100 text-zinc-500"
+                    }`}
+                >
+                    {appliedCount}
+                </span>
+            </Button>
+        </Popover>
+    )
+}
+
+/**
+ * Value input — shape depends on the operator and the field value type:
+ * list operators get a tags input, booleans a true/false select, numbers
+ * a numeric input, and everything else a free-text input.
+ */
+const ConditionValueInput = ({op, valueType, value, disabled, onChange}: {
+    op: FilterOperator
+    valueType: FilterValueType
+    value: unknown
+    disabled: boolean
+    onChange: (value: unknown) => void
+}) => {
+    // `in` / `nin` — a list of values entered as tags.
+    if (isListOperator(op)) {
+        const tags = Array.isArray(value) ? value.map((v) => String(v)) : []
+        return (
+            <Select
+                mode="tags"
+                className="w-full"
+                placeholder="Add values…"
+                disabled={disabled}
+                value={tags}
+                open={false}
+                suffixIcon={null}
+                tokenSeparators={[","]}
+                getPopupContainer={getWithinPopover}
+                onChange={(vals: string[]) => {
+                    // Numeric lists: drop blank tags first (`Number(" ")` is
+                    // 0, not NaN) and keep finite numbers only, so neither a
+                    // stray space nor "Infinity" becomes a filter value.
+                    const filled = vals.filter((s) => s.trim() !== "")
+                    const coerced =
+                        valueType === "number" ? filled.map(Number).filter(Number.isFinite) : vals
+                    onChange(coerced)
+                }}
+            />
+        )
+    }
+    if (valueType === "boolean") {
+        // antd Select option values must be string|number — encode the
+        // boolean as a string and decode on change.
+        return (
+            <Select<string>
+                className="w-full"
+                placeholder="Value"
+                disabled={disabled}
+                value={value === true ? "true" : value === false ? "false" : undefined}
+                options={[
+                    {label: "true", value: "true"},
+                    {label: "false", value: "false"},
+                ]}
+                getPopupContainer={getWithinPopover}
+                onChange={(v) => onChange(v === "true")}
+            />
+        )
+    }
+    if (valueType === "number") {
+        return (
+            <InputNumber
+                className="w-full"
+                placeholder="Value"
+                disabled={disabled}
+                value={typeof value === "number" ? value : null}
+                onChange={(v) => onChange(v ?? "")}
+            />
+        )
+    }
+    return (
+        <Input
+            className="w-full"
+            placeholder="Value"
+            disabled={disabled}
+            value={typeof value === "string" ? value : value == null ? "" : String(value)}
+            onChange={(e) => onChange(e.target.value)}
+        />
+    )
+}
+
+export default ScenarioFilterBar
diff --git a/web/oss/src/components/EvalRunDetails/etl/cellMaterializerContext.ts b/web/oss/src/components/EvalRunDetails/etl/cellMaterializerContext.ts
new file mode 100644
index 0000000000..fa2c3fd900
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/cellMaterializerContext.ts
@@ -0,0 +1,16 @@
+/**
+ * Context shared between the table page (provider) and the cells (consumers);
+ * `null` without a provider. Cells call `materializer.request(slice, req)`
+ * when their column's data is missing from cache; the materializer coalesces
+ * concurrent same-tick requests into one bulk fetch per slice.
+ *
+ * Kept in its own file to avoid a circular import between
+ * `EtlResolvedCell` and the table (the cell imports the context type,
+ * the page sets the context value).
+ */
+
+import {createContext} from "react"
+
+import type {CellMaterializer} from "./useCellMaterialization"
+
+export const CellMaterializerContext = createContext<CellMaterializer | null>(null)
diff --git a/web/oss/src/components/EvalRunDetails/etl/cells/EtlResolvedCell.tsx b/web/oss/src/components/EvalRunDetails/etl/cells/EtlResolvedCell.tsx
new file mode 100644
index 0000000000..d023c89162
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/cells/EtlResolvedCell.tsx
@@ -0,0 +1,377 @@
+/**
+ * EtlResolvedCell — a single cell that resolves its value from molecule caches.
+ *
+ * Each cell:
+ *   1. Subscribes to TanStack cache entries for its scenario via `useQuery`
+ *      with `enabled: false` — no network triggered from a cell render.
+ *      The hydrate / materializer paths populate those entries.
+ *   2. Assembles a HydratedScenarioRow from the four entity slices
+ *      (results / metrics / testcase / traces).
+ *   3. Runs `resolveMappings` against the hydrated row + run schema and
+ *      picks out *just this cell's* column value.
+ *
+ * Non-terminal rendering — the load-bearing difference from the PoC. The
+ * PoC fabricated `scenario.status = "success"` because it only ran
+ * against finished runs. Production scenarios can be pending / running /
+ * failed / partial, so the cell renders four distinct states:
+ *
+ *   value    — resolved, render it.
+ *   running  — scenario not terminal: an in-progress indicator. NEVER a
+ *              bare "—" (the user must tell "still computing" apart from
+ *              "computed nothing").
+ *   loading  — scenario terminal but this cell's slices not hydrated yet.
+ *   missing  — scenario terminal, slices hydrated, genuinely no value: "—".
+ */
+
+import {useContext, useEffect, useMemo} from "react"
+
+import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun"
+import {
+    resolveMappings,
+    unwrapStatsForCompare,
+    type RunSchema,
+    type ResolvedColumn,
+    type ColumnGroup,
+    type HydratedScenarioRow,
+    type HydratableScenario,
+} from "@agenta/entities/evaluationRun/etl"
+import {useQuery, useQueryClient} from "@tanstack/react-query"
+import {Tag} from "antd"
+import clsx from "clsx"
+import {useAtomValue} from "jotai"
+
+import {isTerminalStatus} from "../../atoms/compare"
+import {scenarioRowHeightAtom, type ScenarioRowHeight} from "../../state/rowHeight"
+import {CellMaterializerContext} from "../cellMaterializerContext"
+import {hydrationVersionAtom} from "../useHydrateScenarios"
+
+type ColumnKind = ColumnGroup["kind"]
+/** Max visible text lines (CSS line-clamp) for each row-height setting. */
+const MAX_LINES_BY_HEIGHT: Record<ScenarioRowHeight, number> = {
+    small: 4,
+    medium: 9,
+    large: 18,
+}
+
+/** Entity slices each column kind reads — what a cell of that kind requests and waits on. */
+const SLICES_BY_KIND: Record<ColumnKind, ("results" | "metrics" | "testcases" | "traces")[]> = {
+    testset: ["results", "testcases"],
+    application: ["results", "traces"],
+    evaluator: ["results", "metrics"],
+    metrics: ["metrics"],
+    other: ["results"],
+}
+
+export interface EtlResolvedCellProps {
+    projectId: string
+    runId: string
+    scenarioId: string
+    /** Real scenario status — drives the running / loading / missing split. */
+    scenarioStatus: string
+    /** Column to render — group kind + slug + column name; a `null` slug skips the slug match. */
+    columnKind: ColumnKind
+    columnGroupSlug: string | null
+    columnName: string
+    /** Run schema (steps + mappings); while it is `null` the cell resolves no value. */
+    schema: RunSchema | null
+}
+
+const EtlResolvedCell = ({
+    projectId,
+    runId,
+    scenarioId,
+    scenarioStatus,
+    columnKind,
+    columnGroupSlug,
+    columnName,
+    schema,
+}: EtlResolvedCellProps) => {
+    const queryClient = useQueryClient()
+    const materializer = useContext(CellMaterializerContext)
+    // Bumped after each hydrate / materialize batch. Testcase / trace entries
+    // are read with non-subscribing `getQueryData`, so this forces a re-read.
+    const hydrationVersion = useAtomValue(hydrationVersionAtom)
+    const rowHeight = useAtomValue(scenarioRowHeightAtom)
+    const maxLines = MAX_LINES_BY_HEIGHT[rowHeight]
+
+    // Pure subscriptions — `enabled: false` + no-op queryFn means a cell
+    // render never triggers network. The hydrate / materializer paths are
+    // the only writers; cells just observe.
+    const resultsQ = useQuery<unknown>({
+        queryKey: ["evaluation-results", projectId, runId, scenarioId],
+        queryFn: () => null,
+        enabled: false,
+        staleTime: Infinity,
+    })
+    const metricsQ = useQuery<unknown>({
+        queryKey: ["evaluation-metrics", projectId, runId, scenarioId],
+        queryFn: () => null,
+        enabled: false,
+        staleTime: Infinity,
+    })
+
+    const resultsFetched = resultsQ.data !== undefined
+    const metricsFetched = metricsQ.data !== undefined
+
+    // Build the hydrated row from cache, run resolveMappings, keep only this cell's column.
+    const resolved = useMemo<ResolvedColumn | null>(() => {
+        if (!schema) return null
+
+        const results = (resultsQ.data ??
+            evaluationResultMolecule.get.byScenario({projectId, runId, scenarioId}) ??
+            []) as HydratedScenarioRow["results"]
+        const metrics = (metricsQ.data ??
+            evaluationMetricMolecule.get.byScenario({projectId, runId, scenarioId}) ??
+            []) as HydratedScenarioRow["metrics"]
+
+        const testcaseIdCandidates = results
+            .map((r) => r.testcase_id)
+            .filter((v): v is string => typeof v === "string" && v.length > 0)
+        const testcaseId = testcaseIdCandidates[0] ?? null
+        const testcase = testcaseId
+            ? (queryClient.getQueryData<HydratedScenarioRow["testcase"]>([
+                  "testcase",
+                  projectId,
+                  testcaseId,
+              ]) ?? null)
+            : null
+
+        const traces: Record<string, unknown> = {}
+        for (const r of results) {
+            if (typeof r.trace_id === "string" && r.trace_id) {
+                const cached = queryClient.getQueryData<unknown>([
+                    "trace-entity",
+                    projectId,
+                    r.trace_id,
+                ])
+                if (cached != null) traces[r.trace_id] = cached
+            }
+        }
+
+        const hydrated: HydratedScenarioRow<HydratableScenario> = {
+            scenario: {id: scenarioId, status: scenarioStatus} as HydratableScenario,
+            results,
+            metrics,
+            testcase,
+            traces,
+        }
+
+        const cols = resolveMappings(hydrated, {
+            steps: schema.steps,
+            mappings: schema.mappings,
+        })
+
+        return (
+            cols.find((c) => {
+                if (c.name !== columnName) return false
+                if (c.group.kind !== columnKind) return false
+                if (columnGroupSlug != null && c.group.slug !== columnGroupSlug) return false
+                return true
+            }) ?? null
+        )
+    }, [
+        projectId,
+        runId,
+        scenarioId,
+        scenarioStatus,
+        columnKind,
+        columnGroupSlug,
+        columnName,
+        schema,
+        resultsQ.data,
+        metricsQ.data,
+        hydrationVersion,
+        queryClient,
+    ])
+
+    // Cell-side lazy materialization: ask the page-level materializer for each
+    // slice this cell needs that is missing from cache. NOTE(review): only the
+    // first trace_id is requested, yet `resolved` reads every result's trace.
+    useEffect(() => {
+        if (!materializer || !projectId || !runId || !scenarioId) return
+        for (const slice of SLICES_BY_KIND[columnKind]) {
+            if (slice === "results") {
+                if (!evaluationResultMolecule.get.byScenario({projectId, runId, scenarioId})) {
+                    materializer.request("results", {scenarioId, runId})
+                }
+            } else if (slice === "metrics") {
+                if (!evaluationMetricMolecule.get.byScenario({projectId, runId, scenarioId})) {
+                    materializer.request("metrics", {scenarioId, runId})
+                }
+            } else if (slice === "testcases") {
+                const cachedResults = evaluationResultMolecule.get.byScenario({
+                    projectId,
+                    runId,
+                    scenarioId,
+                })
+                const testcaseId =
+                    cachedResults?.find((r) => typeof r.testcase_id === "string" && r.testcase_id)
+                        ?.testcase_id ?? null
+                if (testcaseId) {
+                    const cached = queryClient.getQueryData(["testcase", projectId, testcaseId])
+                    if (cached == null) materializer.request("testcases", {testcaseId})
+                }
+            } else if (slice === "traces") {
+                const cachedResults = evaluationResultMolecule.get.byScenario({
+                    projectId,
+                    runId,
+                    scenarioId,
+                })
+                const traceId =
+                    cachedResults?.find((r) => typeof r.trace_id === "string" && r.trace_id)
+                        ?.trace_id ?? null
+                if (traceId) {
+                    const cached = queryClient.getQueryData(["trace-entity", projectId, traceId])
+                    if (cached == null) materializer.request("traces", {traceId})
+                }
+            }
+        }
+    }, [materializer, projectId, runId, scenarioId, columnKind, hydrationVersion, queryClient])
+
+    const hasValue = !!resolved && resolved.source !== "missing"
+
+    // Is a slice this cell needs still in flight? Distinguishes
+    // "slice-not-hydrated" (skeleton) from "genuinely missing" ("—") for
+    // a terminal scenario. A slice that the materializer marked failed is
+    // NOT counted as loading — otherwise a permanently rate-limited fetch
+    // would leave the cell on an infinite skeleton.
+    const sliceStillLoading = useMemo(() => {
+        for (const slice of SLICES_BY_KIND[columnKind]) {
+            if (slice === "results") {
+                if (!resultsFetched && !materializer?.hasFailed("results", {scenarioId, runId})) {
+                    return true
+                }
+            } else if (slice === "metrics") {
+                if (!metricsFetched && !materializer?.hasFailed("metrics", {scenarioId, runId})) {
+                    return true
+                }
+            } else if (slice === "testcases") {
+                // Needs results first — covered by the results check above.
+                if (!resultsFetched) continue
+                const cachedResults = evaluationResultMolecule.get.byScenario({
+                    projectId,
+                    runId,
+                    scenarioId,
+                })
+                const testcaseId =
+                    cachedResults?.find((r) => typeof r.testcase_id === "string" && r.testcase_id)
+                        ?.testcase_id ?? null
+                if (!testcaseId) continue
+                const cached = queryClient.getQueryData(["testcase", projectId, testcaseId])
+                if (cached === undefined && !materializer?.hasFailed("testcases", {testcaseId})) {
+                    return true
+                }
+            } else if (slice === "traces") {
+                if (!resultsFetched) continue
+                const cachedResults = evaluationResultMolecule.get.byScenario({
+                    projectId,
+                    runId,
+                    scenarioId,
+                })
+                const traceId =
+                    cachedResults?.find((r) => typeof r.trace_id === "string" && r.trace_id)
+                        ?.trace_id ?? null
+                if (!traceId) continue
+                const cached = queryClient.getQueryData(["trace-entity", projectId, traceId])
+                if (cached === undefined && !materializer?.hasFailed("traces", {traceId})) {
+                    return true
+                }
+            }
+        }
+        return false
+    }, [
+        columnKind,
+        projectId,
+        runId,
+        scenarioId,
+        resultsFetched,
+        metricsFetched,
+        materializer,
+        queryClient,
+        hydrationVersion,
+    ])
+
+    const isTerminal = isTerminalStatus(scenarioStatus)
+
+    let content: React.ReactNode
+    if (hasValue) {
+        content = (
+            <div
+                className="scenario-table-text w-full"
+                style={{
+                    display: "-webkit-box",
+                    WebkitBoxOrient: "vertical",
+                    WebkitLineClamp: maxLines,
+                    overflow: "hidden",
+                    wordBreak: "break-word",
+                }}
+            >
+                {formatValue(unwrapStatsForCompare(resolved!.value))}
+            </div>
+        )
+    } else if (!isTerminal) {
+        // Scenario not finished — in-progress, NOT a missing value.
+        content = <RunningIndicator status={scenarioStatus} />
+    } else if (sliceStillLoading) {
+        // Terminal scenario, this cell's slices not hydrated yet.
+        content = <div className="h-3 w-2/3 rounded bg-neutral-200 animate-pulse" />
+    } else {
+        // Terminal, hydrated, genuinely no value.
+        content = <span className="scenario-table-text scenario-table-placeholder">—</span>
+    }
+
+    return <div className="scenario-table-cell">{content}</div>
+}
+
+/**
+ * Cell content for a scenario that has not reached a terminal status: a
+ * pulsing dot plus label, unlike both the skeleton bar and the "—".
+ */
+const RunningIndicator = ({status}: {status: string}) => {
+    const normalized = status.toLowerCase()
+    const isRunning = normalized === "running"
+    const label = isRunning ? "Running" : normalized === "queued" ? "Queued" : "Pending"
+    const dotColor = isRunning ? "bg-blue-500" : "bg-amber-400"
+    return (
+        <span className="inline-flex items-center gap-1.5 text-xs text-neutral-400">
+            <span className={clsx("h-1.5 w-1.5 rounded-full animate-pulse", dotColor)} />
+            {label}
+        </span>
+    )
+}
+
+/**
+ * Skeleton (not-yet-keyed) row placeholder; same box as a real cell, so rows don't jump.
+ */
+export const EtlSkeletonCell = () => {
+    return (
+        <div className="scenario-table-cell">
+            <div className="h-3 w-2/3 rounded bg-neutral-200 animate-pulse" />
+        </div>
+    )
+}
+
+function formatValue(v: unknown): React.ReactNode {
+    // 800-char cap is a DOM-size guard; CSS line-clamp does the visible truncation.
+    const capText = (text: string) => (text.length > 800 ? text.slice(0, 800) : text)
+    if (v == null) {
+        return <span className="scenario-table-text scenario-table-placeholder">—</span>
+    }
+    switch (typeof v) {
+        case "boolean":
+            return <Tag color={v ? "green" : "red"}>{String(v)}</Tag>
+        case "number":
+            return Number.isInteger(v) ? String(v) : v.toFixed(3)
+        case "string":
+            return capText(v)
+        default:
+            // Everything else renders as JSON, or String(v) when that throws.
+            try {
+                return capText(JSON.stringify(v))
+            } catch {
+                return String(v)
+            }
+    }
+}
+
+export default EtlResolvedCell
diff --git a/web/oss/src/components/EvalRunDetails/etl/columnValueTypes.ts b/web/oss/src/components/EvalRunDetails/etl/columnValueTypes.ts
new file mode 100644
index 0000000000..6195b9c8d1
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/columnValueTypes.ts
@@ -0,0 +1,76 @@
+/**
+ * columnValueTypes — resolves a filterable column's value type from the
+ * evaluator output schema.
+ *
+ * The run graph does not carry column value types. The authoritative
+ * source is the evaluator's JSON output schema: `extractMetrics`
+ * (entities) reads each output property's `schema.type` into
+ * `MetricColumnDefinition.metricType`, and the backend-metadata column
+ * builder copies that onto every annotation column as
+ * `EvaluationTableColumn.metricType`.
+ *
+ * This module turns that `metricType` into the `FilterValueType` the
+ * filter bar uses, so a boolean evaluator output (e.g. an LLM-judge
+ * `success` field) is offered only equality operators and a true/false
+ * input — never the numeric comparators.
+ */
+
+import type {FilterValueType} from "@agenta/entities/evaluationRun/etl"
+
+import type {EvaluationTableColumnsResult} from "../atoms/table"
+
+/**
+ * Map a JSON-schema-derived `metricType` to a filter value type.
+ *
+ * Matching is case-insensitive. An absent / empty type gives `undefined`;
+ * a type outside the lists below gives "unknown".
+ */
+function metricTypeToValueType(metricType: string | undefined): FilterValueType | undefined {
+    if (!metricType) return undefined
+
+    const normalized = metricType.toLowerCase()
+    const isOneOf = (...names: string[]) => names.includes(normalized)
+
+    if (isOneOf("boolean", "bool")) return "boolean"
+    if (isOneOf("number", "integer", "float")) return "number"
+    if (isOneOf("string")) return "string"
+    // array / object / anything else — no safe operator set.
+    return "unknown"
+}
+
+/** Identity of a filterable column, as handed to the value-type resolver. */
+export interface ColumnValueTypeField {
+    groupKind: string
+    groupSlug: string | null
+    columnName: string
+}
+
+/**
+ * Build a `resolveValueType` callback for `buildFilterSchema` from each
+ * `columnResult` column's `metricType`: `<evaluatorSlug>::<columnName>`
+ * first, then the bare name. An unknown type — or a bare name whose
+ * evaluators disagree — gives `undefined` (the schema-only default).
+ */
+export function buildColumnValueTypeResolver(
+    columnResult: EvaluationTableColumnsResult | undefined,
+): (field: ColumnValueTypeField) => FilterValueType | undefined {
+    const bySlugName = new Map<string, FilterValueType>()
+    // `null` marks a name that evaluators give conflicting value types.
+    const byName = new Map<string, FilterValueType | null>()
+
+    for (const col of columnResult?.columns ?? []) {
+        const valueType = metricTypeToValueType(col.metricType || undefined)
+        const name = col.label
+        if (!valueType || typeof name !== "string" || !name) continue
+        const seen = byName.get(name)
+        byName.set(name, seen === undefined || seen === valueType ? valueType : null)
+        if (col.evaluatorSlug) bySlugName.set(`${col.evaluatorSlug}::${name}`, valueType)
+    }
+
+    return (field) => {
+        const scoped = field.groupSlug
+            ? bySlugName.get(`${field.groupSlug}::${field.columnName}`)
+            : undefined
+        return scoped ?? byName.get(field.columnName) ?? undefined
+    }
+}
diff --git a/web/oss/src/components/EvalRunDetails/etl/scenarioFilterState.ts b/web/oss/src/components/EvalRunDetails/etl/scenarioFilterState.ts
new file mode 100644
index 0000000000..a3438fefce
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/scenarioFilterState.ts
@@ -0,0 +1,60 @@
+/**
+ * Scenario table filter state — the active multi-predicate filter
+ * (decision D8: flat AND/OR), one per run.
+ *
+ * The atom holds the *raw* filter the filter bar edits — it may contain
+ * half-built conditions (a column picked but no value yet). Evaluation
+ * uses `toEffectiveFilter`, which drops the incomplete ones, so a
+ * partially-typed condition never filters every row out.
+ */
+
+import type {PredicateGroup, RowPredicate} from "@agenta/entities/evaluationRun/etl"
+import {atom} from "jotai"
+import {atomFamily} from "jotai/utils"
+
+const EMPTY_FILTER: PredicateGroup = {op: "and", conditions: []}
+
+/** Per-run active filter (raw — may hold half-built conditions); starts as an empty AND group. */
+export const scenarioFilterAtomFamily = atomFamily((_runId: string) =>
+    atom<PredicateGroup>(EMPTY_FILTER),
+)
+
+/** True once a condition names a column and carries a defined, non-empty value. */
+export const isConditionComplete = (c: RowPredicate): boolean => {
+    const {columnName, value} = c
+    // List values (`in` / `nin`) need at least one entry; scalars must be set.
+    const filled = Array.isArray(value) ? value.length > 0 : value !== undefined && value !== ""
+    return Boolean(columnName) && filled
+}
+
+/**
+ * Derive the filter that is actually evaluated: the group's `op` is kept
+ * and every half-built condition is removed.
+ */
+export const toEffectiveFilter = (group: PredicateGroup): PredicateGroup => {
+    const conditions = group.conditions.filter((c) => isConditionComplete(c))
+    return {op: group.op, conditions}
+}
+
+/** Whether the group contains any condition that is complete. */
+export const isScenarioFilterActive = (group: PredicateGroup): boolean =>
+    group.conditions.findIndex((c) => isConditionComplete(c)) !== -1
+
+/** Live scan status — written by the scenarios table, read by the filter bar. */
+export interface ScenarioFilterStatus {
+    /** Confirmed matches found so far. */
+    matchCount: number
+    /** True while the filter scan is actively working. */
+    scanning: boolean
+}
+
+const EMPTY_STATUS: ScenarioFilterStatus = {matchCount: 0, scanning: false}
+
+/**
+ * Per-run filter scan status (initially zero matches, not scanning). The
+ * scenarios table runs the scan and writes this; the filter bar — in the
+ * run header, a separate part of the tree — reads it for its match count.
+ */
+export const scenarioFilterStatusAtomFamily = atomFamily((_runId: string) =>
+    atom<ScenarioFilterStatus>(EMPTY_STATUS),
+)
diff --git a/web/oss/src/components/EvalRunDetails/etl/useCellMaterialization.ts b/web/oss/src/components/EvalRunDetails/etl/useCellMaterialization.ts
new file mode 100644
index 0000000000..d1b12e2d1b
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/useCellMaterialization.ts
@@ -0,0 +1,268 @@
+/**
+ * useCellMaterialization — lazy, batched, run-aware cell-side prefetch.
+ *
+ * The page-level `useHydrateScenarios` only fetches entity slices the
+ * active predicate touches (Phase 2). In Phase 1 (no predicate) it fetches
+ * nothing, so every visible cell materializes itself.
+ *
+ * If 30 visible cells each call `molecule.actions.prefetchByScenarioIds(
+ * [scenarioId])` independently, the backend gets 30 round trips. To avoid
+ * that, this hook coalesces same-tick requests:
+ *
+ *   1. Cell asks for `(slice, {scenarioId, runId})` on first render.
+ *   2. Request is appended to a per-slice queue (an array held in a ref).
+ *   3. After a microtask flush, the hook drains every per-slice queue and
+ *      issues ONE bulk prefetch per (slice, runId) with all requested IDs.
+ *   4. Cells re-render via `hydrationVersionAtom` once the writes land.
+ *
+ * Run-aware: results / metrics caches are run-scoped, so the queue is
+ * grouped by `runId` and one prefetch is issued per run. This is what
+ * lets comparison rows (which carry a different `runId` than the base
+ * run) hydrate correctly.
+ */
+
+import {useEffect, useRef} from "react"
+
+import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun"
+import type {EntitySlice} from "@agenta/entities/evaluationRun/etl"
+import {testcaseMolecule} from "@agenta/entities/testcase"
+import {traceSpanMolecule} from "@agenta/entities/trace"
+import {getDefaultStore, useSetAtom} from "jotai"
+import {queryClientAtom} from "jotai-tanstack-query"
+
+import {hydrationVersionAtom} from "./useHydrateScenarios"
+
+interface MaterializeRequest {
+    /** scenarioId — required for results / metrics. */
+    scenarioId?: string
+    /** runId — required for results / metrics (run-scoped caches). */
+    runId?: string
+    /** testcase_id — required for testcases. */
+    testcaseId?: string
+    /** trace_id — required for traces. */
+    traceId?: string
+}
+
+interface BatchState {
+    /** Queued requests per slice. Drained on next microtask. */
+    queues: Record<EntitySlice, MaterializeRequest[]>
+    /** Per-slice "currently fetching" tracking keys so we don't double-fire. */
+    inflightKeys: Record<EntitySlice, Set<string>>
+    /**
+     * Per-slice "tried and got nothing back" tracking keys. The most
+     * common cause is HTTP 429 rate-limiting — the molecule's prefetch
+     * swallows the error and returns empty, leaving the cache empty.
+     * Without this set, the cell rerenders forever in a tight retry loop.
+     * Marked permanently for the session — user reloads to retry.
+     */
+    failedKeys: Record<EntitySlice, Set<string>>
+    /** True if a drain is already scheduled this tick. */
+    scheduled: boolean
+}
+
+const initialBatchState = (): BatchState => {
+    // Every call allocates fresh arrays / sets, so a reset never shares
+    // tracking state with the batch state it replaces.
+    const emptyKeySets = (): Record<EntitySlice, Set<string>> => ({
+        results: new Set<string>(),
+        metrics: new Set<string>(),
+        testcases: new Set<string>(),
+        traces: new Set<string>(),
+    })
+    return {
+        queues: {results: [], metrics: [], testcases: [], traces: []},
+        inflightKeys: emptyKeySets(),
+        failedKeys: emptyKeySets(),
+        scheduled: false,
+    }
+}
+
+/**
+ * Dedupe key for a (slice, request) pair. The results / metrics caches
+ * are run-scoped, so those slices key on `runId::scenarioId`; testcases
+ * and traces key on their own id. Yields null when the request is
+ * missing a field its slice needs.
+ */
+const trackingKey = (slice: EntitySlice, req: MaterializeRequest): string | null => {
+    if (slice === "testcases") return req.testcaseId ?? null
+    if (slice === "traces") return req.traceId ?? null
+    if (slice !== "results" && slice !== "metrics") return null
+    // Run-scoped slices (results / metrics): both ids must be present and
+    // non-empty before a key can be formed.
+    const {runId, scenarioId} = req
+    return runId && scenarioId ? `${runId}::${scenarioId}` : null
+}
+
+interface UseCellMaterializationArgs {
+    projectId: string | null
+    /** Page (base) run id — used only to reset state on scope change. */
+    runId: string | null
+}
+
+export interface CellMaterializer {
+    /**
+     * Request materialization of (slice, request). The hook coalesces
+     * concurrent requests on the same microtask into one bulk fetch per
+     * (slice, runId). Safe to call repeatedly from a cell's render —
+     * duplicates are deduped.
+     */
+    request: (slice: EntitySlice, req: MaterializeRequest) => void
+    /**
+     * True when a prior fetch for (slice, request) settled without
+     * populating the cache — most often a 429. Lets a cell stop showing a
+     * skeleton for a slice that will never arrive this session.
+     */
+    hasFailed: (slice: EntitySlice, req: MaterializeRequest) => boolean
+}
+
+const groupScenariosByRun = (reqs: MaterializeRequest[]): Map<string, string[]> =>
+    // runId → distinct scenarioIds in first-seen order. Requests lacking
+    // either id are skipped.
+    reqs.reduce((grouped, {runId, scenarioId}) => {
+        if (!runId || !scenarioId) return grouped
+        const bucket = grouped.get(runId)
+        if (!bucket) grouped.set(runId, [scenarioId])
+        else if (!bucket.includes(scenarioId)) bucket.push(scenarioId)
+        return grouped
+    }, new Map<string, string[]>())
+
+const dedupField = (reqs: MaterializeRequest[], field: "testcaseId" | "traceId"): string[] => {
+    // Keep only non-empty string values of `field`; the Set drops repeats
+    // while preserving first-seen order.
+    const values = reqs
+        .map((r) => r[field])
+        .filter((v): v is string => typeof v === "string" && v.length > 0)
+    return Array.from(new Set(values))
+}
+
+/** Returns a materializer that coalesces same-tick cell requests into bulk prefetches. */
+export const useCellMaterialization = ({
+    projectId,
+    runId,
+}: UseCellMaterializationArgs): CellMaterializer => {
+    const stateRef = useRef<BatchState>(initialBatchState())
+    const bumpHydrationVersion = useSetAtom(hydrationVersionAtom)
+
+    useEffect(() => {
+        // Reset on scope change.
+        stateRef.current = initialBatchState()
+    }, [projectId, runId])
+
+    const drain = async () => {
+        const state = stateRef.current
+        state.scheduled = false
+        if (!projectId) return
+
+        // Snapshot + reset the queues — new requests can queue while
+        // we're fetching, those trigger their own drain.
+        const queues = state.queues
+        state.queues = {results: [], metrics: [], testcases: [], traces: []}
+
+        const resultsByRun = groupScenariosByRun(queues.results)
+        const metricsByRun = groupScenariosByRun(queues.metrics)
+        const testcaseIds = dedupField(queues.testcases, "testcaseId")
+        const traceIds = dedupField(queues.traces, "traceId")
+
+        // Mark in-flight before starting fetch so subsequent ticks dedupe.
+        for (const [run, ids] of resultsByRun) {
+            for (const id of ids) state.inflightKeys.results.add(`${run}::${id}`)
+        }
+        for (const [run, ids] of metricsByRun) {
+            for (const id of ids) state.inflightKeys.metrics.add(`${run}::${id}`)
+        }
+        for (const id of testcaseIds) state.inflightKeys.testcases.add(id)
+        for (const id of traceIds) state.inflightKeys.traces.add(id)
+
+        const qc = getDefaultStore().get(queryClientAtom)
+
+        // Once a fetch settles, any requested id whose cache entry is still
+        // undefined is treated as a silent failure (most often a 429) and
+        // marked failed, so request() skips it — no request → 429 → retry loop.
+        const markRunFailures = (
+            slice: "results" | "metrics",
+            run: string,
+            scenarioIds: string[],
+        ) => {
+            if (!qc) return
+            const prefix = slice === "results" ? "evaluation-results" : "evaluation-metrics"
+            for (const id of scenarioIds) {
+                const tk = `${run}::${id}`
+                state.inflightKeys[slice].delete(tk)
+                const cached = qc.getQueryData([prefix, projectId, run, id])
+                if (cached === undefined) state.failedKeys[slice].add(tk)
+            }
+        }
+        const markIdFailures = (slice: "testcases" | "traces", ids: string[]) => {
+            if (!qc) return
+            const prefix = slice === "testcases" ? "testcase" : "trace-entity"
+            for (const id of ids) {
+                state.inflightKeys[slice].delete(id)
+                const cached = qc.getQueryData([prefix, projectId, id])
+                if (cached === undefined) state.failedKeys[slice].add(id)
+            }
+        }
+
+        const tasks: Promise<unknown>[] = []
+        for (const [run, scenarioIds] of resultsByRun) {
+            tasks.push(
+                evaluationResultMolecule.actions
+                    .prefetchByScenarioIds({projectId, runId: run, scenarioIds})
+                    .finally(() => markRunFailures("results", run, scenarioIds)),
+            )
+        }
+        for (const [run, scenarioIds] of metricsByRun) {
+            tasks.push(
+                evaluationMetricMolecule.actions
+                    .prefetchByScenarioIds({projectId, runId: run, scenarioIds})
+                    .finally(() => markRunFailures("metrics", run, scenarioIds)),
+            )
+        }
+        if (testcaseIds.length > 0) {
+            tasks.push(
+                testcaseMolecule.actions
+                    .prefetchByIds({projectId, testcaseIds})
+                    .finally(() => markIdFailures("testcases", testcaseIds)),
+            )
+        }
+        if (traceIds.length > 0) {
+            tasks.push(
+                traceSpanMolecule.actions
+                    .prefetchByIds({projectId, traceIds})
+                    .finally(() => markIdFailures("traces", traceIds)),
+            )
+        }
+
+        // allSettled, not all: wait until every task's `.finally`
+        // bookkeeping has run, then bump even when a task rejected so cells
+        // re-render and can observe hasFailed() instead of a stuck skeleton.
+        const outcomes = await Promise.allSettled(tasks)
+        const rejected = outcomes.filter((o) => o.status === "rejected")
+        if (rejected.length > 0) console.warn("[useCellMaterialization] batch failed:", rejected)
+        if (tasks.length > 0) bumpHydrationVersion((v) => v + 1)
+    }
+
+    const request: CellMaterializer["request"] = (slice, req) => {
+        const state = stateRef.current
+        const tk = trackingKey(slice, req)
+        if (!tk) return
+        // Skip if a previous drain for this key failed (most often a 429).
+        if (state.failedKeys[slice].has(tk)) return
+        // Skip if already being fetched by an earlier batch.
+        if (state.inflightKeys[slice].has(tk)) return
+        // Skip if a sibling cell already queued the same key this tick.
+        if (state.queues[slice].some((r) => trackingKey(slice, r) === tk)) return
+        state.queues[slice].push(req)
+        if (!state.scheduled) {
+            state.scheduled = true
+            queueMicrotask(drain)
+        }
+    }
+
+    const hasFailed: CellMaterializer["hasFailed"] = (slice, req) => {
+        const tk = trackingKey(slice, req)
+        if (!tk) return false
+        return stateRef.current.failedKeys[slice].has(tk)
+    }
+
+    return {request, hasFailed}
+}
diff --git a/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx b/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx
new file mode 100644
index 0000000000..ecaf63bfcc
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/useEtlColumns.tsx
@@ -0,0 +1,121 @@
+/**
+ * useEtlColumns
+ *
+ * Derives the scenario table's **schema columns** (testset / application /
+ * evaluator(s) / metrics / other) from a run's schema (steps + mappings)
+ * via `groupRunColumns`, and adapts them into nested-header IVT columns
+ * whose leaf cells mount `EtlResolvedCell`.
+ *
+ * This replaces the backend-metadata column path (`usePreviewColumns`)
+ * for the *rendered* schema columns. The meta columns (index / status,
+ * timestamp, action) and the column-visibility trigger stay on the
+ * production path — they are not schema-derived. The two are stitched
+ * together in `Table.tsx`.
+ *
+ * "other"-kind columns are kept (the PoC dropped them) so the visible
+ * column set matches the backend-metadata path.
+ */
+
+import {useMemo} from "react"
+
+import {groupRunColumns, type ColumnGroup, type RunSchema} from "@agenta/entities/evaluationRun/etl"
+import {Tooltip} from "antd"
+import type {ColumnsType} from "antd/es/table"
+
+import type {PreviewTableRow} from "../atoms/tableRows"
+
+import EtlResolvedCell, {EtlSkeletonCell} from "./cells/EtlResolvedCell"
+import EtlColumnHeader from "./EtlColumnHeader"
+
+const WIDTH_BY_KIND: Record<ColumnGroup["kind"], number> = {
+    testset: 220,
+    application: 400,
+    evaluator: 180,
+    metrics: 140,
+    other: 180,
+}
+
+export interface UseEtlColumnsArgs {
+    projectId: string | null
+    runId: string | null
+    schema: RunSchema | null
+}
+
+/**
+ * Schema columns for the scenario table, as nested-header IVT columns.
+ * Returns `[]` until `schema`, `projectId` and `runId` are all available.
+ */
+export const useEtlColumns = ({
+    projectId,
+    runId,
+    schema,
+}: UseEtlColumnsArgs): ColumnsType<PreviewTableRow> => {
+    return useMemo<ColumnsType<PreviewTableRow>>(() => {
+        if (!schema || !projectId || !runId) return []
+
+        // "metrics"-kind columns are intentionally skipped here. The
+        // scenario table's "Metrics" group is the *static* invocation
+        // metrics (cost / duration / tokens) injected by the
+        // backend-metadata column path — not run-mapping-derived — so that
+        // group is kept on the production path in `Table.tsx` and rendered
+        // by the existing metric cell. Emitting an ETL metrics group too
+        // would duplicate it.
+        const grouped = groupRunColumns(schema.steps, schema.mappings).filter(
+            (g) => g.group.kind !== "metrics",
+        )
+
+        return grouped.map((g) => {
+            const children = g.columns.map((leaf) => {
+                const key = `${g.group.key}::${leaf.name}`
+                return {
+                    key,
+                    columnVisibilityLabel: leaf.name,
+                    title: (
+                        <Tooltip title={leaf.name} placement="top">
+                            <span className="block max-w-full overflow-hidden text-ellipsis whitespace-nowrap text-left">
+                                {leaf.name}
+                            </span>
+                        </Tooltip>
+                    ),
+                    width: WIDTH_BY_KIND[leaf.kind],
+                    minWidth: WIDTH_BY_KIND[leaf.kind],
+                    ellipsis: true,
+                    align: "left" as const,
+                    render: (_: unknown, record: PreviewTableRow) => {
+                        // antd's virtual table can briefly call a cell
+                        // render with an out-of-range `undefined` record
+                        // while the (filtered) dataSource is shrinking —
+                        // render nothing for those phantom rows.
+                        if (record == null) return null
+                        // Skeleton / not-yet-keyed rows (incl. comparison
+                        // placeholders) render a fixed-height placeholder.
+                        if (record.__isSkeleton || !record.scenarioId) {
+                            return <EtlSkeletonCell />
+                        }
+                        return (
+                            <EtlResolvedCell
+                                projectId={projectId}
+                                // Comparison rows carry their own runId.
+                                runId={record.runId ?? runId}
+                                scenarioId={record.scenarioId}
+                                scenarioStatus={record.status}
+                                columnKind={leaf.kind}
+                                columnGroupSlug={leaf.groupSlug}
+                                columnName={leaf.name}
+                                schema={schema}
+                            />
+                        )
+                    },
+                }
+            })
+
+            return {
+                key: g.group.key,
+                columnVisibilityLabel: g.group.label,
+                title: <EtlColumnHeader group={g.group} />,
+                align: "left" as const,
+                children,
+            }
+        })
+    }, [projectId, runId, schema])
+}
diff --git a/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts b/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts
new file mode 100644
index 0000000000..560ea14eec
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/useHydrateScenarios.ts
@@ -0,0 +1,266 @@
+/**
+ * useHydrateScenarios
+ *
+ * Watches the scenario rows the table has loaded and triggers a bulk
+ * hydrate pass per *new* page — bulk requests per page, all entities
+ * populated together.
+ *
+ * Flow per newly-seen scenario set:
+ *   1. evaluationResultMolecule.actions.prefetchByScenarioIds  → results
+ *   2. evaluationMetricMolecule.actions.prefetchByScenarioIds  → metrics
+ *   3. derive testcase_ids from results
+ *   4. prefetchTestcasesByIds(...)                             → testcases
+ *   5. derive trace_ids from results
+ *   6. prefetchTracesByIds(...)                                → traces
+ *
+ * Cache writes go through the molecules' `setQueryData` paths, so cells
+ * subscribing via `useQuery({queryKey: cacheKey, enabled: false})` see
+ * the data the moment it lands.
+ *
+ * Phase 1 note: with no active predicate and `sliceMode === "auto"` the
+ * page-level hydrate is intentionally a no-op — cells materialize their
+ * own (visible-only) data via `useCellMaterialization`. The hook is wired
+ * now so Phase 2 filtering can drive predicate-aware page hydration
+ * without a structural change.
+ */
+
+import {useEffect, useMemo, useRef, useState} from "react"
+
+import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun"
+import {
+    predicateToEntitySlices,
+    type EntitySlice,
+    type PredicateGroup,
+    type RowPredicate,
+    type RunSchema,
+} from "@agenta/entities/evaluationRun/etl"
+import {prefetchTestcasesByIds} from "@agenta/entities/testcase"
+import {prefetchTracesByIds} from "@agenta/entities/trace"
+import {atom, useSetAtom} from "jotai"
+
+const ALL_SLICES: EntitySlice[] = ["results", "metrics", "testcases", "traces"]
+
+/**
+ * Minimal row shape this hook reads — identity + skeleton flag. Kept
+ * structural (fields `unknown`) so it accepts both `PreviewTableRow[]` and
+ * the loosely-typed `InfiniteTableRowBase[]` the IVT pagination hook
+ * returns, without coupling to either.
+ */
+export interface HydratableRowRef {
+    scenarioId?: unknown
+    __isSkeleton?: unknown
+}
+
+/**
+ * Hydration-version atom — bumped each time a hydrate / materialize batch
+ * completes. Cells subscribe to it so they re-render and pick up
+ * late-arriving testcase / trace cache writes (whose IDs aren't known
+ * until results land). Cheap: number atom, single React tick per batch.
+ */
+export const hydrationVersionAtom = atom(0)
+
+export interface HydrationProgress {
+    /** Scenario IDs hydrated since the last reset (project / run / slice-set change). */
+    hydratedScenarios: number
+    /** Pages observed (one bulk hydrate pass per page). */
+    pagesHydrated: number
+    /** Which entity slices the next page load will fetch. */
+    activeSlices: EntitySlice[]
+    /** Last error from any prefetch call, or null. */
+    lastError: string | null
+    /** True while a hydrate pass is mid-flight. */
+    isHydrating: boolean
+}
+
+const INITIAL_PROGRESS: HydrationProgress = {
+    hydratedScenarios: 0,
+    pagesHydrated: 0,
+    activeSlices: ALL_SLICES,
+    lastError: null,
+    isHydrating: false,
+}
+
+/**
+ * Slice-fetch strategy for the page-level hydrate.
+ *
+ * - "auto" (default): fetch only what's needed right now. With an active
+ *   predicate that's the predicate's slice set; with no predicate that's
+ *   zero slices — cells materialize their own data on first render
+ *   (visible-only, virtualization-aware).
+ * - "all": always fetch all 4 slices. For workflows that need every
+ *   column populated up-front (exports, bulk actions).
+ */
+export type SliceFetchMode = "auto" | "all"
+
+export interface UseHydrateScenariosArgs {
+    projectId: string | null
+    runId: string | null
+    rows: readonly HydratableRowRef[]
+    /** Run schema — maps an active predicate's column to entity slices. */
+    schema?: RunSchema | null
+    /**
+     * Active filter — a single predicate, a predicate array, or a flat
+     * AND/OR `PredicateGroup` (Phase 2). When present, page-level hydrate
+     * fetches the entity slices the filter needs so it can be evaluated.
+     */
+    predicate?: RowPredicate | RowPredicate[] | PredicateGroup | null
+    /** Hydrate strategy — see `SliceFetchMode`. Default "auto". */
+    sliceMode?: SliceFetchMode
+}
+
+/** Page-level bulk hydrate: prefetches the active entity slices for newly loaded scenarios. */
+export const useHydrateScenarios = ({
+    projectId,
+    runId,
+    rows,
+    schema = null,
+    predicate = null,
+    sliceMode = "auto",
+}: UseHydrateScenariosArgs): HydrationProgress => {
+    const [progress, setProgress] = useState<HydrationProgress>(INITIAL_PROGRESS)
+    const hydratedScenarioIdsRef = useRef<Set<string>>(new Set())
+    const inflightRef = useRef<Promise<void> | null>(null)
+    const bumpHydrationVersion = useSetAtom(hydrationVersionAtom)
+
+    // Compute the slice set this hydrate pass should fetch.
+    const activeSlices = useMemo<EntitySlice[]>(() => {
+        if (sliceMode === "all") return ALL_SLICES
+        const result = predicateToEntitySlices(schema, predicate)
+        if (result.fallbackToAll) return ALL_SLICES
+        if (result.slices.size === 0) {
+            // Auto mode, no predicate → no page-level fetch; cells self-materialize.
+            return []
+        }
+        // Testcase / trace ids live on result rows, so those slices need results too.
+        const slices = new Set<EntitySlice>(result.slices)
+        if (slices.has("testcases") || slices.has("traces")) slices.add("results")
+        return ALL_SLICES.filter((s) => slices.has(s))
+    }, [schema, predicate, sliceMode])
+
+    const activeSlicesKey = activeSlices.join(",")
+    useEffect(() => {
+        hydratedScenarioIdsRef.current = new Set()
+        setProgress({...INITIAL_PROGRESS, activeSlices})
+    }, [projectId, runId, activeSlicesKey])
+
+    useEffect(() => {
+        if (!projectId || !runId) return
+        // Only consider materialized (non-skeleton) scenarios with real IDs.
+        const candidateIds = rows
+            .filter(
+                (r) =>
+                    !r.__isSkeleton && typeof r.scenarioId === "string" && r.scenarioId.length > 0,
+            )
+            .map((r) => r.scenarioId as string)
+
+        const seen = hydratedScenarioIdsRef.current
+        const newIds = candidateIds.filter((id) => !seen.has(id))
+        if (newIds.length === 0) return
+
+        const slicesToFetch = new Set(activeSlices)
+        // On-demand mode: nothing to fetch here; cells use useCellMaterialization.
+        if (slicesToFetch.size === 0) {
+            for (const id of newIds) seen.add(id)
+            setProgress((p) => ({
+                ...p,
+                hydratedScenarios: p.hydratedScenarios + newIds.length,
+                pagesHydrated: p.pagesHydrated + 1,
+                isHydrating: false,
+                lastError: null,
+            }))
+            return
+        }
+
+        // Mark optimistically so a mid-flight re-render doesn't queue duplicates.
+        for (const id of newIds) seen.add(id)
+        const emptyOutcome = {cacheHits: 0, cacheMisses: 0, fetchMs: 0}
+        // A reset swaps the seen-set; a batch holding a stale set must not report.
+        const report: typeof setProgress = (next) => {
+            if (hydratedScenarioIdsRef.current === seen) setProgress(next)
+        }
+
+        const hydrateBatch = async () => {
+            if (hydratedScenarioIdsRef.current !== seen) return
+            report((p) => ({...p, isHydrating: true, lastError: null}))
+            try {
+                // Stage 1 — results + metrics (parallel).
+                const [resultsOutcome] = await Promise.all([
+                    slicesToFetch.has("results")
+                        ? evaluationResultMolecule.actions.prefetchByScenarioIds({
+                              projectId,
+                              runId,
+                              scenarioIds: newIds,
+                          })
+                        : Promise.resolve({
+                              ...emptyOutcome,
+                              results: [],
+                              byScenarioId: new Map<string, never>(),
+                          }),
+                    slicesToFetch.has("metrics")
+                        ? evaluationMetricMolecule.actions.prefetchByScenarioIds({
+                              projectId,
+                              runId,
+                              scenarioIds: newIds,
+                          })
+                        : Promise.resolve(null),
+                ])
+
+                // Stage 2 — derive testcase_ids + trace_ids from results.
+                const testcaseIds = new Set<string>()
+                if (slicesToFetch.has("testcases")) {
+                    for (const result of resultsOutcome.results) {
+                        if (typeof result.testcase_id === "string" && result.testcase_id) {
+                            testcaseIds.add(result.testcase_id)
+                        }
+                    }
+                }
+
+                const traceIds = new Set<string>()
+                if (slicesToFetch.has("traces")) {
+                    for (const result of resultsOutcome.results) {
+                        if (typeof result.trace_id === "string" && result.trace_id) {
+                            traceIds.add(result.trace_id)
+                        }
+                    }
+                }
+
+                await Promise.all([
+                    testcaseIds.size > 0
+                        ? prefetchTestcasesByIds({
+                              projectId,
+                              testcaseIds: Array.from(testcaseIds),
+                          })
+                        : Promise.resolve(emptyOutcome),
+                    traceIds.size > 0
+                        ? prefetchTracesByIds({
+                              projectId,
+                              traceIds: Array.from(traceIds),
+                          })
+                        : Promise.resolve(emptyOutcome),
+                ])
+
+                report((p) => ({
+                    hydratedScenarios: p.hydratedScenarios + newIds.length,
+                    pagesHydrated: p.pagesHydrated + 1,
+                    activeSlices,
+                    lastError: null,
+                    isHydrating: false,
+                }))
+                bumpHydrationVersion((v) => v + 1)
+            } catch (e) {
+                // On failure, un-mark so a later effect run can retry these ids.
+                for (const id of newIds) seen.delete(id)
+                report((p) => ({
+                    ...p,
+                    lastError: e instanceof Error ? e.message : String(e),
+                    isHydrating: false,
+                }))
+            }
+        }
+
+        // Serialize hydrate passes: quick successive page loads queue up, never overlap.
+        inflightRef.current = (inflightRef.current ?? Promise.resolve()).then(hydrateBatch)
+    }, [projectId, runId, rows, activeSlicesKey, activeSlices, bumpHydrationVersion])
+
+    return progress
+}
diff --git a/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts b/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts
new file mode 100644
index 0000000000..7458d9fc2a
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/useScenarioFilter.ts
@@ -0,0 +1,197 @@
+/**
+ * useScenarioFilter — applies the active multi-predicate filter (D8) to
+ * the scenario rows.
+ *
+ * While a filter is active the result holds **confirmed matches only** —
+ * a row appears once it is hydrated AND satisfies the filter. The list
+ * therefore only ever grows during a scan: rows materialize as their data
+ * arrives, and a row is never shown and then dropped, so the table does
+ * not flicker. Unhydrated rows and skeleton placeholders are withheld
+ * until they confirm (or are ruled out).
+ *
+ * Because a strict filter can reduce the visible row count below the
+ * viewport height, the IVT's scroll-triggered `loadMore` may never fire.
+ * While a filter is active this hook drives `loadNextPage` itself until
+ * enough confirmed matches accumulate or the dataset is exhausted.
+ */
+
+import {useEffect, useMemo} from "react"
+
+import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun"
+import {
+    evaluateRowFilter,
+    resolveMappings,
+    type HydratedScenarioRow,
+    type PredicateGroup,
+    type ResolvedColumn,
+    type RunSchema,
+} from "@agenta/entities/evaluationRun/etl"
+import {useQueryClient, type QueryClient} from "@tanstack/react-query"
+import {useAtomValue} from "jotai"
+
+import {
+    scenarioFilterAtomFamily,
+    isScenarioFilterActive,
+    toEffectiveFilter,
+} from "./scenarioFilterState"
+import {hydrationVersionAtom} from "./useHydrateScenarios"
+
+/** Enough confirmed matches to fill a typical viewport before the loop stops. */
+const VIEWPORT_FILL_TARGET = 30
+
+interface FilterableRow {
+    scenarioId?: unknown
+    __isSkeleton?: unknown
+}
+
+/**
+ * Resolve a scenario's columns from the molecule / query caches. Returns
+ * `null` while results + metrics are both empty ("not known yet").
+ * NOTE(review): status is hard-coded to "success" below — confirm intended.
+ */
+function resolveScenarioColumnsFromCache(
+    queryClient: QueryClient,
+    projectId: string,
+    runId: string,
+    scenarioId: string,
+    schema: RunSchema,
+): ResolvedColumn[] | null {
+    const results = (evaluationResultMolecule.get.byScenario({projectId, runId, scenarioId}) ??
+        []) as HydratedScenarioRow["results"]
+    const metrics = (evaluationMetricMolecule.get.byScenario({projectId, runId, scenarioId}) ??
+        []) as HydratedScenarioRow["metrics"]
+    if (results.length === 0 && metrics.length === 0) return null
+
+    const testcaseId =
+        results.find((r) => typeof r.testcase_id === "string" && r.testcase_id)?.testcase_id ?? null
+    const testcase = testcaseId
+        ? (queryClient.getQueryData<HydratedScenarioRow["testcase"]>([
+              "testcase",
+              projectId,
+              testcaseId,
+          ]) ?? null)
+        : null
+
+    const traces: Record<string, unknown> = {}
+    for (const r of results) {
+        if (typeof r.trace_id === "string" && r.trace_id) {
+            const cached = queryClient.getQueryData<unknown>([
+                "trace-entity",
+                projectId,
+                r.trace_id,
+            ])
+            if (cached != null) traces[r.trace_id] = cached
+        }
+    }
+
+    return resolveMappings(
+        {
+            scenario: {id: scenarioId, status: "success"},
+            results,
+            metrics,
+            testcase,
+            traces,
+        },
+        {steps: schema.steps, mappings: schema.mappings},
+    )
+}
+
+export interface UseScenarioFilterArgs<TRow extends FilterableRow> {
+    projectId: string | null
+    runId: string | null
+    schema: RunSchema | null
+    /** The base (main-run) rows, pre-merge. */
+    baseRows: readonly TRow[]
+    loadNextPage: () => void
+    hasMore: boolean
+    isFetching: boolean
+}
+
+export interface UseScenarioFilterResult<TRow extends FilterableRow> {
+    /** Raw filter (may contain half-built conditions) — for the filter bar. */
+    rawFilter: PredicateGroup
+    /** Filter actually evaluated — half-built conditions dropped. */
+    effectiveFilter: PredicateGroup
+    /** True when at least one complete condition is set. */
+    active: boolean
+    /** Base rows after the filter — unfiltered when no filter is active. */
+    filteredBaseRows: TRow[]
+    /** Rows confirmed (hydrated AND matching) to satisfy the filter. */
+    confirmedMatchCount: number
+    /**
+     * True while the viewport-fill loop still intends to load more pages
+     * (a filter is active, the target match count is not yet reached, and
+     * the dataset has more pages). Goes false once enough matches are
+     * found — even if the dataset has more pages — so the UI can stop
+     * showing "scanning" when the loop is actually idle.
+     */
+    isFilling: boolean
+}
+
+export function useScenarioFilter<TRow extends FilterableRow>({
+    projectId,
+    runId,
+    schema,
+    baseRows,
+    loadNextPage,
+    hasMore,
+    isFetching,
+}: UseScenarioFilterArgs<TRow>): UseScenarioFilterResult<TRow> {
+    const queryClient = useQueryClient()
+    const rawFilter = useAtomValue(scenarioFilterAtomFamily(runId ?? "__none__"))
+    // Invalidation signal only — read so the filter memo re-runs after cache writes.
+    const hydrationVersion = useAtomValue(hydrationVersionAtom)
+
+    const effectiveFilter = useMemo(() => toEffectiveFilter(rawFilter), [rawFilter])
+    const active = isScenarioFilterActive(rawFilter)
+
+    // Confirmed matches only: skeleton / unhydrated rows are withheld, so the
+    // list only grows during a scan. NOTE(review): with an active filter but
+    // no schema / projectId / runId the rows pass through unfiltered — confirm.
+    const filteredBaseRows = useMemo(() => {
+        if (!active || !schema || !projectId || !runId) return baseRows as TRow[]
+        return (baseRows as TRow[]).filter((r) => {
+            const scenarioId = typeof r.scenarioId === "string" ? r.scenarioId : null
+            // Skeleton / not-yet-keyed rows can't be evaluated — withhold.
+            if (r.__isSkeleton || !scenarioId) return false
+            const cols = resolveScenarioColumnsFromCache(
+                queryClient,
+                projectId,
+                runId,
+                scenarioId,
+                schema,
+            )
+            // Not hydrated yet — withhold until its data arrives.
+            if (!cols) return false
+            return evaluateRowFilter(effectiveFilter, cols)
+        })
+    }, [baseRows, active, schema, projectId, runId, effectiveFilter, hydrationVersion, queryClient])
+
+    // With a filter active (and schema / ids present), every row in
+    // `filteredBaseRows` is a confirmed match — so the count is its length.
+    const confirmedMatchCount = active ? filteredBaseRows.length : 0
+
+    // The viewport-fill loop still wants more pages — i.e. the autonomous
+    // scan is genuinely in progress.
+    const isFilling = active && hasMore && confirmedMatchCount < VIEWPORT_FILL_TARGET
+
+    // Viewport-fill loop — a strict filter may keep the visible row count
+    // below the viewport, so IVT's scroll-triggered loadMore never fires.
+    // Drive it ourselves until enough confirmed matches accumulate or the
+    // dataset is exhausted.
+    useEffect(() => {
+        if (!active) return
+        if (!hasMore || isFetching) return
+        if (confirmedMatchCount >= VIEWPORT_FILL_TARGET) return
+        loadNextPage()
+    }, [active, hasMore, isFetching, confirmedMatchCount, loadNextPage])
+
+    return {
+        rawFilter,
+        effectiveFilter,
+        active,
+        filteredBaseRows,
+        confirmedMatchCount,
+        isFilling,
+    }
+}
diff --git a/web/oss/src/components/EvalRunDetails/etl/useScenarioLiveUpdates.ts b/web/oss/src/components/EvalRunDetails/etl/useScenarioLiveUpdates.ts
new file mode 100644
index 0000000000..f2234fb96b
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/useScenarioLiveUpdates.ts
@@ -0,0 +1,181 @@
+/**
+ * useScenarioLiveUpdates — keeps the ETL scenarios table fresh while a
+ * run is still executing (T6).
+ *
+ * The ETL table resolves every cell from molecule caches that are
+ * populated *once*: `useHydrateScenarios` / `useCellMaterialization`
+ * fetch a scenario's results + metrics on first sight and never again.
+ * That is correct for a finished run, but a run in progress mutates —
+ *
+ *   - a scenario's `status` flips  pending → running → success
+ *   - its results / metrics only appear once it completes
+ *
+ * Without a refresh loop a scenario that finishes after the table loaded
+ * keeps a stale empty molecule cache (an empty `[]` was cached while it
+ * was running) and a stale `running` row status, so its cells show the
+ * "Running" indicator forever.
+ *
+ * While the run is non-terminal this hook, on an interval:
+ *
+ *   1. Refetches the loaded scenario *pages* — refreshes each row's
+ *      `status` so a completed scenario's cells leave the running state.
+ *   2. Evicts + re-prefetches the results / metrics molecule caches for
+ *      every scenario that is still running, or that finished since the
+ *      last tick — replacing the stale empty cache with real values.
+ *      (`prefetchByScenarioIds` is cache-aware and would otherwise skip
+ *      an already-cached scenario, so the evict is required.)
+ *   3. Bumps `hydrationVersionAtom` so cells re-render, re-resolve, and
+ *      (via `useCellMaterialization`) pick up freshly-derivable testcase
+ *      / trace slices.
+ *
+ * One final pass runs when the run reaches a terminal status, then the
+ * loop stops.
+ */
+
+import {useCallback, useEffect, useRef} from "react"
+
+import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun"
+import {useSetAtom, useStore} from "jotai"
+import {queryClientAtom} from "jotai-tanstack-query"
+
+import {isTerminalStatus} from "../atoms/compare"
+import type {PreviewTableRow} from "../atoms/tableRows"
+import {evaluationPreviewTableStore} from "../evaluationPreviewTableStore"
+
+import {hydrationVersionAtom} from "./useHydrateScenarios"
+
+/** Refresh cadence — mirrors the run-status poll in `evaluationRunQueryAtomFamily`. */
+const LIVE_REFRESH_INTERVAL_MS = 5000
+
+export interface UseScenarioLiveUpdatesArgs {
+    projectId: string | null
+    runId: string | null
+    /**
+     * Latest run-level status. Drives whether the live loop runs — it
+     * stops once the run is terminal. `null` / `undefined` (status not
+     * loaded yet) is treated as not-live so the loop doesn't start early.
+     */
+    runStatus: string | null | undefined
+    /** Page size — addresses the base run's page-query atoms. */
+    pageSize: number
+}
+
+export const useScenarioLiveUpdates = ({
+    projectId,
+    runId,
+    runStatus,
+    pageSize,
+}: UseScenarioLiveUpdatesArgs): void => {
+    const store = useStore()
+    const bumpHydrationVersion = useSetAtom(hydrationVersionAtom)
+    /** Scenario ids still non-terminal as of the last tick whose refresh succeeded. */
+    const lastNonTerminalRef = useRef<Set<string>>(new Set())
+    /** Guard against overlapping ticks if a refresh runs long. */
+    const inflightRef = useRef(false)
+
+    const tick = useCallback(async () => {
+        if (!projectId || !runId) return
+        if (inflightRef.current) return
+        inflightRef.current = true
+        try {
+            // 1. Refetch the loaded scenario pages → refresh row statuses.
+            const qc = store.get(queryClientAtom)
+            if (qc) {
+                await qc.invalidateQueries({
+                    queryKey: [evaluationPreviewTableStore.key, runId],
+                    exact: false,
+                })
+            }
+
+            // 2. Read the fresh rows straight from the store — bypasses the
+            //    React-state lag so a just-flipped status is seen now.
+            const rows = store.get(
+                evaluationPreviewTableStore.atoms.combinedRowsAtomFamily({
+                    scopeId: runId,
+                    pageSize,
+                }),
+            ) as PreviewTableRow[]
+
+            const loadedIds = new Set<string>()
+            const currentNonTerminal = new Set<string>()
+            for (const row of rows) {
+                if (row.__isSkeleton) continue
+                const sid = row.scenarioId
+                if (typeof sid !== "string" || !sid) continue
+                loadedIds.add(sid)
+                if (!isTerminalStatus(row.status)) currentNonTerminal.add(sid)
+            }
+
+            // 3. Refresh set: scenarios still running, plus those that
+            //    finished since the last tick (their molecule cache still
+            //    holds the empty `[]` written while they were running).
+            const refreshIds = new Set<string>(currentNonTerminal)
+            for (const sid of lastNonTerminalRef.current) {
+                if (!currentNonTerminal.has(sid) && loadedIds.has(sid)) refreshIds.add(sid)
+            }
+
+            // 4. Evict + re-prefetch the molecule caches for those scenarios.
+            if (refreshIds.size > 0) {
+                const scenarioIds = Array.from(refreshIds)
+                evaluationResultMolecule.actions.evictByScenarioIds({
+                    projectId,
+                    runId,
+                    scenarioIds,
+                })
+                evaluationMetricMolecule.actions.evictByScenarioIds({
+                    projectId,
+                    runId,
+                    scenarioIds,
+                })
+                await Promise.all([
+                    evaluationResultMolecule.actions.prefetchByScenarioIds({
+                        projectId,
+                        runId,
+                        scenarioIds,
+                    }),
+                    evaluationMetricMolecule.actions.prefetchByScenarioIds({
+                        projectId,
+                        runId,
+                        scenarioIds,
+                    }),
+                ])
+            }
+            // Commit the snapshot only after the refresh succeeded: if anything above
+            // threw, just-finished scenarios stay tracked and the next tick retries them.
+            lastNonTerminalRef.current = currentNonTerminal
+
+            // 5. Re-render cells so they re-resolve and (via the cell
+            //    materializer) re-derive testcase / trace slices.
+            bumpHydrationVersion((v) => v + 1)
+        } catch {
+            // Transient failure — the next tick retries.
+        } finally {
+            inflightRef.current = false
+        }
+    }, [projectId, runId, pageSize, store, bumpHydrationVersion])
+
+    const isLive = !!projectId && !!runId && runStatus != null && !isTerminalStatus(runStatus)
+
+    // Interval refresh while the run is non-terminal.
+    useEffect(() => {
+        if (!isLive) return
+        const id = setInterval(() => void tick(), LIVE_REFRESH_INTERVAL_MS)
+        return () => clearInterval(id)
+    }, [isLive, tick])
+
+    // One final pass when the run finishes — catches scenarios that completed
+    // between the previous tick and the run reaching a terminal status.
+    // NOTE(review): `tick` no-ops while another tick is in flight, so this pass can be lost.
+    const wasLiveRef = useRef(false)
+    const flushedRef = useRef(false)
+    useEffect(() => {
+        if (isLive) {
+            wasLiveRef.current = true
+            flushedRef.current = false
+            return
+        }
+        if (!wasLiveRef.current || flushedRef.current) return
+        flushedRef.current = true
+        void tick()
+    }, [isLive, tick])
+}
diff --git a/web/oss/src/components/EvalRunDetails/etl/useScopeChangeEviction.ts b/web/oss/src/components/EvalRunDetails/etl/useScopeChangeEviction.ts
new file mode 100644
index 0000000000..70fcb520d5
--- /dev/null
+++ b/web/oss/src/components/EvalRunDetails/etl/useScopeChangeEviction.ts
@@ -0,0 +1,52 @@
+/**
+ * useScopeChangeEviction
+ *
+ * Evicts the molecule caches the ETL hydrate path wrote when the
+ * (projectId, runId) scope changes or the table unmounts.
+ *
+ * Triggers:
+ *   - on dependency change (the *previous* scope's data gets evicted)
+ *   - on unmount (component going away — release everything we wrote)
+ *
+ * What it evicts:
+ *   - results + metrics → molecule.actions.evictByRunId (scoped to runId)
+ *   - testcase + trace-entity + span → clearCacheByPrefix (run-agnostic)
+ *
+ * Atom families are intentionally NOT cleared: other views (focus drawer,
+ * observability tab) may subscribe to the same trace atoms. A
+ * `family.clear()` would yank their subscriptions too.
+ */
+
+import {useEffect, useRef} from "react"
+
+import {evaluationResultMolecule, evaluationMetricMolecule} from "@agenta/entities/evaluationRun"
+import {clearCacheByPrefix} from "@agenta/entities/evaluationRun/etl"
+
+export interface UseScopeChangeEvictionArgs {
+    projectId: string | null
+    runId: string | null
+}
+
+export const useScopeChangeEviction = ({projectId, runId}: UseScopeChangeEvictionArgs): void => {
+    // Holds the scope committed by the latest effect run. The cleanup reads it
+    // back, so what gets evicted is the *outgoing* scope, not the incoming one.
+    const scopeRef = useRef<UseScopeChangeEvictionArgs>({projectId: null, runId: null})
+
+    useEffect(() => {
+        scopeRef.current = {projectId, runId}
+        const evictOutgoingScope = (): void => {
+            const {projectId: outgoingProjectId, runId: outgoingRunId} = scopeRef.current
+            if (!outgoingProjectId || !outgoingRunId) return
+            const outgoing = {projectId: outgoingProjectId, runId: outgoingRunId}
+            try {
+                evaluationResultMolecule.actions.evictByRunId(outgoing)
+                evaluationMetricMolecule.actions.evictByRunId(outgoing)
+                // Testcase / trace-entity / span entries are cleared by prefix.
+                clearCacheByPrefix(["testcase", "trace-entity", "span"])
+            } catch {
+                // QueryClient may already be torn down on app close — swallow.
+            }
+        }
+        return evictOutgoingScope
+    }, [projectId, runId])
+}
diff --git a/web/oss/src/components/Filters/Filters.tsx b/web/oss/src/components/Filters/Filters.tsx
index b81cab3576..447b21a9a1 100644
--- a/web/oss/src/components/Filters/Filters.tsx
+++ b/web/oss/src/components/Filters/Filters.tsx
@@ -716,6 +716,11 @@ const Filters: React.FC<Props> = ({
 
     const isApplyDisabled = rowValidations.some(({isValid}) => !isValid)
 
+    const nonPermanentFilterCount = useMemo(
+        () => filter.filter((f) => !f.isPermanent).length,
+        [filter],
+    )
+
     const onDeleteFilter = (index: number) =>
         setFilter((prev) => prev.filter((_, idx) => idx !== index))
     const clearFilter = () => {
@@ -1665,6 +1670,7 @@ const Filters: React.FC<Props> = ({
                                             )}
 
                                             {!item.isPermanent &&
+                                                nonPermanentFilterCount > 1 &&
                                                 !(
                                                     isAnnotationFieldSelected &&
                                                     (isEvaluatorActive || isFeedbackActive)
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/filterSchema.test.ts b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/filterSchema.test.ts
new file mode 100644
index 0000000000..86bb7eed80
--- /dev/null
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/filterSchema.test.ts
@@ -0,0 +1,107 @@
+/**
+ * buildFilterSchema — derives the filterable fields the Phase 2 / T4
+ * filter UI offers (decision D8).
+ *
+ * Covers field derivation, the schema-only value-type heuristic, the
+ * type-matched operator sets, the `resolveValueType` refinement seam, and
+ * deduplication.
+ */
+
+import assert from "node:assert/strict"
+import {describe, it} from "node:test"
+
+import {buildFilterSchema, operatorsForType} from "../filterSchema"
+import type {RunSchema} from "../resolveMappings"
+
+const SCHEMA: RunSchema = {
+    steps: [
+        {key: "in", type: "input", references: {testset: {id: "t1", slug: "ts"}}},
+        {key: "ev", type: "annotation", references: {evaluator: {id: "e1", slug: "exact-match"}}},
+    ],
+    mappings: [
+        {column: {kind: "input", name: "question"}, step: {key: "in", path: "data.question"}},
+        {column: {kind: "annotation", name: "success"}, step: {key: "ev", path: "out"}},
+        // Metrics path overrides step-type grouping → "metrics" group.
+        {
+            column: {kind: "metric", name: "Cost"},
+            step: {key: "ev", path: "attributes.ag.metrics.cost"},
+        },
+    ],
+}
+
+describe("operatorsForType", () => {
+    it("number gets ordered comparisons", () => {
+        const numberOps = operatorsForType("number")
+        for (const op of ["lt", "lte", "gt", "gte"]) assert.ok(numberOps.includes(op as never))
+    })
+
+    it("unknown / boolean withhold ordered comparisons", () => {
+        for (const valueType of ["unknown", "boolean"] as const) {
+            const ops = operatorsForType(valueType)
+            for (const op of ["lt", "gt"]) assert.equal(ops.includes(op as never), false)
+        }
+    })
+
+    it("returns a fresh array (callers may mutate)", () => {
+        const mutated = operatorsForType("number")
+        mutated.pop()
+        assert.notEqual(mutated.length, operatorsForType("number").length)
+    })
+})
+
+describe("buildFilterSchema", () => {
+    it("returns an empty schema for a null run schema", () => {
+        assert.deepEqual(buildFilterSchema(null), {fields: []})
+    })
+
+    it("emits one field per mapped column", () => {
+        const columnNames = buildFilterSchema(SCHEMA).fields.map((f) => f.columnName)
+        assert.deepEqual(columnNames.sort(), ["Cost", "question", "success"])
+    })
+
+    it("types metrics columns as number, others as unknown", () => {
+        const schema = buildFilterSchema(SCHEMA)
+        const costField = schema.fields.find((f) => f.columnName === "Cost")
+        const successField = schema.fields.find((f) => f.columnName === "success")
+        assert.equal(costField?.valueType, "number")
+        assert.equal(successField?.valueType, "unknown")
+        assert.ok(costField?.operators.includes("gt"))
+        assert.equal(successField?.operators.includes("gt"), false)
+    })
+
+    it("carries the targeting triple + labels", () => {
+        const schema = buildFilterSchema(SCHEMA)
+        const successField = schema.fields.find((f) => f.columnName === "success")
+        assert.equal(successField?.groupKind, "evaluator")
+        assert.equal(successField?.groupSlug, "exact-match")
+        assert.equal(successField?.label, "success")
+        assert.ok(successField?.groupLabel)
+    })
+
+    it("resolveValueType refines a field's type + operators", () => {
+        const refined = buildFilterSchema(SCHEMA, {
+            resolveValueType: (f) => (f.columnName === "success" ? "boolean" : undefined),
+        })
+        const successField = refined.fields.find((f) => f.columnName === "success")
+        assert.equal(successField?.valueType, "boolean")
+        assert.deepEqual(successField?.operators, ["eq", "ne"])
+        // Fields the resolver returns `undefined` for keep the schema-only default.
+        assert.equal(refined.fields.find((f) => f.columnName === "Cost")?.valueType, "number")
+    })
+
+    it("deduplicates identical (groupKind, groupSlug, columnName) triples", () => {
+        const schemaWithRepeat: RunSchema = {
+            steps: SCHEMA.steps,
+            mappings: [
+                ...SCHEMA.mappings,
+                // Repeats an existing mapping: same column name, same step.
+                {
+                    column: {kind: "input", name: "question"},
+                    step: {key: "in", path: "data.question"},
+                },
+            ],
+        }
+        const deduped = buildFilterSchema(schemaWithRepeat)
+        assert.equal(deduped.fields.filter((f) => f.columnName === "question").length, 1)
+    })
+})
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts
new file mode 100644
index 0000000000..6026360173
--- /dev/null
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/groupRunColumns.test.ts
@@ -0,0 +1,197 @@
+/**
+ * groupRunColumns — column-parity regression guard for the ETL scenario
+ * table migration (docs/designs/eval-scenarios-table-integration.md, T2).
+ *
+ * The backend-metadata column path (`usePreviewColumns`) and the run-graph
+ * column path (`useEtlColumns` → `groupRunColumns`) must surface the SAME
+ * visible column set. The most load-bearing part of that: the PoC's
+ * `useEtlColumns` dropped `group.kind === "other"` columns ("skip in the
+ * test page"). Production must keep them — dropping them silently shrinks
+ * the user-visible column set. These tests pin that down.
+ */
+
+import assert from "node:assert/strict"
+import {describe, it} from "node:test"
+
+import {groupRunColumns, type RunMapping, type RunStep} from "../resolveMappings"
+
+// A representative testset+app+evaluator run schema. auto / human / online
+// runs all share this shape — the eval type only changes which metrics
+// show and the scenario fetch order, neither of which `groupRunColumns`
+// (a pure steps+mappings function) is aware of.
+const STEPS: RunStep[] = [
+    {key: "input", type: "input", references: {testset: {id: "ts1", slug: "my-testset"}}},
+    {
+        key: "invocation",
+        type: "invocation",
+        references: {application: {id: "app1", slug: "my-app"}},
+    },
+    {
+        key: "eval-exact",
+        type: "annotation",
+        references: {evaluator: {id: "ev1", slug: "exact-match"}},
+    },
+]
+
+const MAPPINGS: RunMapping[] = [
+    {column: {kind: "input", name: "question"}, step: {key: "input", path: "data.question"}},
+    {
+        column: {kind: "input", name: "ground_truth"},
+        step: {key: "input", path: "data.ground_truth"},
+    },
+    {
+        column: {kind: "invocation", name: "output"},
+        step: {key: "invocation", path: "attributes.ag.data.outputs"},
+    },
+    {
+        column: {kind: "annotation", name: "success"},
+        step: {key: "eval-exact", path: "attributes.ag.data.outputs.success"},
+    },
+    // Metrics path overrides the step-type grouping → "metrics" group.
+    {
+        column: {kind: "metric", name: "Cost"},
+        step: {key: "invocation", path: "attributes.ag.metrics.costs.cumulative.total"},
+    },
+]
+
+describe("groupRunColumns — testset/app/evaluator/metrics", () => {
+    it("groups columns by source in stable order", () => {
+        const kinds = groupRunColumns(STEPS, MAPPINGS).map((entry) => entry.group.kind)
+        assert.deepEqual(kinds, ["testset", "application", "evaluator", "metrics"])
+    })
+
+    it("keeps every mapped column — none dropped", () => {
+        // Sum the leaf columns of every group; nothing mapped may go missing.
+        let columnCount = 0
+        for (const entry of groupRunColumns(STEPS, MAPPINGS)) columnCount += entry.columns.length
+        assert.equal(columnCount, MAPPINGS.length)
+    })
+
+    it("places multiple columns under their shared group", () => {
+        const testsetGroup = groupRunColumns(STEPS, MAPPINGS).find(
+            (entry) => entry.group.kind === "testset",
+        )
+        assert.ok(testsetGroup)
+        const columnNames = testsetGroup.columns.map((column) => column.name)
+        assert.deepEqual(columnNames, ["question", "ground_truth"])
+    })
+
+    it("carries group kind + slug onto each leaf", () => {
+        const evaluatorGroup = groupRunColumns(STEPS, MAPPINGS).find(
+            (entry) => entry.group.kind === "evaluator",
+        )
+        assert.ok(evaluatorGroup)
+        // Inspect the first leaf of the evaluator group.
+        const [firstLeaf] = evaluatorGroup.columns
+        assert.equal(firstLeaf.name, "success")
+        assert.equal(firstLeaf.kind, "evaluator")
+        assert.equal(firstLeaf.groupSlug, "exact-match")
+    })
+})
+
+describe("groupRunColumns — 'other' columns are INCLUDED (regression)", () => {
+    it("includes columns whose step has an unrecognised type", () => {
+        // A step type the grouping does not recognise, plus one column mapped to it.
+        const transformStep: RunStep = {key: "transform", type: "transform"}
+        const transformMapping: RunMapping = {
+            column: {kind: "transform", name: "normalized"},
+            step: {key: "transform", path: "data.normalized"},
+        }
+        const grouped = groupRunColumns([...STEPS, transformStep], [...MAPPINGS, transformMapping])
+
+        const otherGroup = grouped.find((entry) => entry.group.kind === "other")
+        assert.ok(otherGroup, "the unrecognised-step column must produce an 'other' group")
+        const otherNames = otherGroup.columns.map((column) => column.name)
+        assert.deepEqual(otherNames, ["normalized"])
+        // "other" sorts last.
+        const lastGroup = grouped[grouped.length - 1]
+        assert.equal(lastGroup.group.kind, "other")
+    })
+
+    it("includes columns whose mapping references a missing step", () => {
+        // This mapping's step key is absent from STEPS.
+        const orphanMapping: RunMapping = {
+            column: {kind: "meta", name: "orphan"},
+            step: {key: "does-not-exist", path: "x"},
+        }
+        const grouped = groupRunColumns(STEPS, [...MAPPINGS, orphanMapping])
+
+        const otherGroup = grouped.find((entry) => entry.group.kind === "other")
+        assert.ok(otherGroup, "a mapping with no resolvable step must produce an 'other' group")
+        const otherNames = otherGroup.columns.map((column) => column.name)
+        assert.deepEqual(otherNames, ["orphan"])
+    })
+
+    it("the visible column count includes 'other' columns", () => {
+        const steps: RunStep[] = [...STEPS, {key: "transform", type: "transform"}]
+        // The extra column carries no `kind`; it must still be counted.
+        const mappings: RunMapping[] = [
+            ...MAPPINGS,
+            {column: {name: "normalized"}, step: {key: "transform", path: "p"}},
+        ]
+
+        let columnCount = 0
+        for (const entry of groupRunColumns(steps, mappings)) columnCount += entry.columns.length
+        assert.equal(columnCount, mappings.length)
+    })
+})
+
+describe("groupRunColumns — edge cases", () => {
+    /** Total number of leaf columns across every group. */
+    const countColumns = (groups: ReturnType<typeof groupRunColumns>): number =>
+        groups.reduce((sum, entry) => sum + entry.columns.length, 0)
+
+    it("skips mappings with no column name", () => {
+        // One mapping with an empty name, one with no name at all.
+        const withUnnamed: RunMapping[] = [
+            ...MAPPINGS,
+            {column: {kind: "input", name: ""}, step: {key: "input", path: "data.blank"}},
+            {column: {kind: "input"}, step: {key: "input", path: "data.noname"}},
+        ]
+        const grouped = groupRunColumns(STEPS, withUnnamed)
+        assert.equal(countColumns(grouped), MAPPINGS.length)
+    })
+
+    it("returns an empty list for an empty schema", () => {
+        assert.deepEqual(groupRunColumns([], []), [])
+    })
+
+    it("drops internal _dedup_id columns (regression)", () => {
+        const withDedup: RunMapping[] = [
+            ...MAPPINGS,
+            {
+                column: {kind: "input", name: "testcase_dedup_id"},
+                step: {key: "input", path: "data.testcase_dedup_id"},
+            },
+        ]
+        const grouped = groupRunColumns(STEPS, withDedup)
+        const leafNames = grouped.flatMap((entry) => entry.columns.map((column) => column.name))
+        assert.equal(leafNames.includes("testcase_dedup_id"), false)
+        // The dedup column is excluded; every other mapped column is kept.
+        assert.equal(countColumns(grouped), MAPPINGS.length)
+    })
+
+    it("disambiguates two evaluators emitting the same column name", () => {
+        // A second evaluator step that emits the same `success` column.
+        const withJudgeStep: RunStep[] = [
+            ...STEPS,
+            {
+                key: "eval-judge",
+                type: "annotation",
+                references: {evaluator: {id: "ev2", slug: "llm-judge"}},
+            },
+        ]
+        const withJudgeMapping: RunMapping[] = [
+            ...MAPPINGS,
+            {
+                column: {kind: "annotation", name: "success"},
+                step: {key: "eval-judge", path: "attributes.ag.data.outputs.success"},
+            },
+        ]
+        const grouped = groupRunColumns(withJudgeStep, withJudgeMapping)
+        const evaluatorGroups = grouped.filter((entry) => entry.group.kind === "evaluator")
+        assert.equal(evaluatorGroups.length, 2)
+        const slugs = evaluatorGroups.map((entry) => entry.group.slug)
+        assert.deepEqual(slugs, ["exact-match", "llm-judge"])
+    })
+})
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/predicateGroup.test.ts b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/predicateGroup.test.ts
new file mode 100644
index 0000000000..3c11d614c9
--- /dev/null
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/__tests__/predicateGroup.test.ts
@@ -0,0 +1,329 @@
+/**
+ * Multi-predicate AND/OR filtering (Phase 2 / T4 — decision D8).
+ *
+ * Covers the pure predicate-evaluation core: single predicate, flat AND/OR
+ * groups, the `RowFilter` dispatch, the row-level `matchesRowFilter`
+ * convenience, the `makePredicateGroupFilter` pipeline transform, and the
+ * `predicateToEntitySlices` union for group inputs.
+ */
+
+import assert from "node:assert/strict"
+import {describe, it} from "node:test"
+
+import type {Chunk} from "../../../etl/core/types"
+import type {HydratedScenarioRow} from "../hydrateScenariosTransform"
+import {predicateToEntitySlices} from "../predicateToEntitySlices"
+import type {ColumnGroup, ResolvedColumn, RunSchema} from "../resolveMappings"
+import {
+    evaluatePredicateGroup,
+    evaluateRowFilter,
+    evaluateRowPredicate,
+    isPredicateGroup,
+    makePredicateGroupFilter,
+    matchesRowFilter,
+    type PredicateGroup,
+    type RowPredicate,
+} from "../rowPredicateFilter"
+
+// A resolved column fixture — the shape `resolveMappings` emits.
+function col(opts: {
+    name: string
+    kind: ColumnGroup["kind"]
+    slug?: string | null
+    value: unknown
+}): ResolvedColumn {
+    const {name, kind, slug, value} = opts
+    return {
+        name,
+        kind,
+        stepKey: "step",
+        stepType: kind,
+        path: "",
+        value,
+        source: "metric",
+        group: {
+            kind,
+            slug: slug ?? null,
+            label: kind,
+            key: `${kind}:${slug ?? "x"}`,
+            refs: null,
+        },
+    }
+}
+
+const COLS: ResolvedColumn[] = [
+    col({name: "success", kind: "evaluator", slug: "exact-match", value: true}),
+    col({name: "score", kind: "evaluator", slug: "llm-judge", value: 0.9}),
+    col({name: "country", kind: "testset", slug: "ts", value: "US"}),
+]
+
+// =============================================================================
+// evaluateRowPredicate — one clause
+// =============================================================================
+
+describe("evaluateRowPredicate", () => {
+    it("eq / ne", () => {
+        const successIsTrue: RowPredicate = {
+            groupKind: "evaluator",
+            columnName: "success",
+            op: "eq",
+            value: true,
+        }
+        assert.equal(evaluateRowPredicate(successIsTrue, COLS), true)
+        const successIsNotTrue: RowPredicate = {
+            groupKind: "evaluator",
+            columnName: "success",
+            op: "ne",
+            value: true,
+        }
+        assert.equal(evaluateRowPredicate(successIsNotTrue, COLS), false)
+    })
+
+    it("numeric comparisons", () => {
+        const scoreIs = (op: RowPredicate["op"], value: number): RowPredicate => ({
+            groupKind: "evaluator",
+            columnName: "score",
+            op,
+            value,
+        })
+        assert.equal(evaluateRowPredicate(scoreIs("gt", 0.8), COLS), true)
+        assert.equal(evaluateRowPredicate(scoreIs("gte", 0.9), COLS), true)
+        assert.equal(evaluateRowPredicate(scoreIs("lt", 0.8), COLS), false)
+        assert.equal(evaluateRowPredicate(scoreIs("lte", 0.9), COLS), true)
+    })
+
+    it("in / nin", () => {
+        const countryInList: RowPredicate = {
+            groupKind: "testset",
+            columnName: "country",
+            op: "in",
+            value: ["US", "CA"],
+        }
+        assert.equal(evaluateRowPredicate(countryInList, COLS), true)
+        const countryNotInList: RowPredicate = {
+            groupKind: "testset",
+            columnName: "country",
+            op: "nin",
+            value: ["US", "CA"],
+        }
+        assert.equal(evaluateRowPredicate(countryNotInList, COLS), false)
+    })
+
+    it("narrows by groupSlug when set", () => {
+        // Same column name across two evaluators — slug disambiguates.
+        // The predicate targets evaluator "b", whose value is `false`.
+        const twoEvaluators = [
+            col({name: "success", kind: "evaluator", slug: "a", value: true}),
+            col({name: "success", kind: "evaluator", slug: "b", value: false}),
+        ]
+        const matched = evaluateRowPredicate(
+            {
+                groupKind: "evaluator",
+                groupSlug: "b",
+                columnName: "success",
+                op: "eq",
+                value: false,
+            },
+            twoEvaluators,
+        )
+
+        assert.equal(matched, true)
+    })
+
+    it("a missing column fails eq but passes ne (compares against undefined)", () => {
+        const missingColumn = (op: RowPredicate["op"]): RowPredicate => ({
+            groupKind: "evaluator",
+            columnName: "does-not-exist",
+            op,
+            value: true,
+        })
+        assert.equal(evaluateRowPredicate(missingColumn("eq"), COLS), false)
+        assert.equal(evaluateRowPredicate(missingColumn("ne"), COLS), true)
+    })
+
+    it("unwraps a stats-blob value before comparing", () => {
+        const statsBlobCols = [
+            col({
+                name: "success",
+                kind: "evaluator",
+                value: {type: "binary", freq: [{value: true, density: 1}]},
+            }),
+        ]
+        const successIsTrue: RowPredicate = {
+            groupKind: "evaluator",
+            columnName: "success",
+            op: "eq",
+            value: true,
+        }
+        assert.equal(evaluateRowPredicate(successIsTrue, statsBlobCols), true)
+    })
+})
+
+// =============================================================================
+// evaluatePredicateGroup — flat AND / OR
+// =============================================================================
+
+describe("evaluatePredicateGroup", () => {
+    const pass: RowPredicate = {
+        groupKind: "evaluator",
+        columnName: "success",
+        op: "eq",
+        value: true,
+    }
+    const fail: RowPredicate = {groupKind: "evaluator", columnName: "score", op: "gt", value: 999}
+
+    it("AND — every condition must match", () => {
+        assert.equal(evaluatePredicateGroup({op: "and", conditions: [pass, pass]}, COLS), true)
+        assert.equal(evaluatePredicateGroup({op: "and", conditions: [pass, fail]}, COLS), false)
+    })
+
+    it("OR — at least one condition must match", () => {
+        assert.equal(evaluatePredicateGroup({op: "or", conditions: [fail, pass]}, COLS), true)
+        assert.equal(evaluatePredicateGroup({op: "or", conditions: [fail, fail]}, COLS), false)
+    })
+
+    it("an empty group is no constraint — passes for both ops", () => {
+        assert.equal(evaluatePredicateGroup({op: "and", conditions: []}, COLS), true)
+        assert.equal(evaluatePredicateGroup({op: "or", conditions: []}, COLS), true)
+    })
+})
+
+// =============================================================================
+// evaluateRowFilter — dispatch + isPredicateGroup
+// =============================================================================
+
+describe("evaluateRowFilter / isPredicateGroup", () => {
+    // Builds a fresh single-clause predicate; each test also wraps one
+    // in a group so both filter shapes are exercised.
+    const successIsTrue = (): RowPredicate => ({
+        groupKind: "evaluator",
+        columnName: "success",
+        op: "eq",
+        value: true,
+    })
+
+    it("isPredicateGroup distinguishes a group from a single predicate", () => {
+        const single = successIsTrue()
+        const group: PredicateGroup = {op: "and", conditions: [single]}
+        assert.equal(isPredicateGroup(single), false)
+        assert.equal(isPredicateGroup(group), true)
+    })
+
+    it("evaluates a single predicate or a group transparently", () => {
+        const single = successIsTrue()
+        const group: PredicateGroup = {op: "or", conditions: [single]}
+        assert.equal(evaluateRowFilter(single, COLS), true)
+        assert.equal(evaluateRowFilter(group, COLS), true)
+    })
+})
+
+// =============================================================================
+// matchesRowFilter — resolve schema, then evaluate
+// =============================================================================
+
+// Smallest useful run: one annotation step ("eval", evaluator "exact-match")
+// whose `out` path is mapped to the "success" column.
+const ANNOTATION_SCHEMA: RunSchema = {
+    steps: [
+        {key: "eval", type: "annotation", references: {evaluator: {id: "e1", slug: "exact-match"}}},
+    ],
+    mappings: [{column: {kind: "annotation", name: "success"}, step: {key: "eval", path: "out"}}],
+}
+
+/** Hydrated row fixture whose single metric stores `success` at `eval.out`. */
+function annotationRow(success: boolean): HydratedScenarioRow {
+    // resolveFromMetric only reads `m.data`, so a bare `{data}` metric is cast into place.
+    const metrics = [{data: {eval: {out: success}}}] as unknown as HydratedScenarioRow["metrics"]
+    const scenario: HydratedScenarioRow["scenario"] = {id: "s1", status: "success"}
+    // Every other slice is an empty placeholder.
+    return {scenario, results: [], metrics, testcase: null, traces: {}}
+}
+
+describe("matchesRowFilter", () => {
+    it("resolves the run schema then evaluates the filter", () => {
+        const conditions: RowPredicate[] = [
+            {groupKind: "evaluator", columnName: "success", op: "eq", value: true},
+        ]
+        const run = (row: HydratedScenarioRow) =>
+            matchesRowFilter({op: "and", conditions}, ANNOTATION_SCHEMA, row)
+        assert.deepEqual([run(annotationRow(true)), run(annotationRow(false))], [true, false])
+    })
+})
+
+// =============================================================================
+// makePredicateGroupFilter — pipeline transform
+// =============================================================================
+
+describe("makePredicateGroupFilter", () => {
+    it("keeps only rows the filter matches", async () => {
+        // One-condition OR group: a row survives only when its "success" column
+        // resolves to true.
+        const filter: PredicateGroup = {
+            op: "or",
+            conditions: [{groupKind: "evaluator", columnName: "success", op: "eq", value: true}],
+        }
+        const keepMatching = makePredicateGroupFilter({filter, schema: ANNOTATION_SCHEMA})
+
+        // Two matching rows with a non-matching row between them.
+        const items = [true, false, true].map((success) => annotationRow(success))
+        const input: Chunk<HydratedScenarioRow> = {items, cursor: null}
+
+        // Only the two matching rows may remain.
+        const {items: kept} = await keepMatching(input)
+        assert.equal(kept.length, 2)
+    })
+})
+
+// =============================================================================
+// predicateToEntitySlices — union across a group's conditions
+// =============================================================================
+
+// Two-step run: an input step ("in", testset "ts") mapped to column "question" and
+// an annotation step ("ev", evaluator "em") mapped to column "success".
+const MIXED_SCHEMA: RunSchema = {
+    steps: [
+        {key: "in", type: "input", references: {testset: {id: "t1", slug: "ts"}}},
+        {key: "ev", type: "annotation", references: {evaluator: {id: "e1", slug: "em"}}},
+    ],
+    mappings: [
+        {column: {kind: "input", name: "question"}, step: {key: "in", path: "data.question"}},
+        {column: {kind: "annotation", name: "success"}, step: {key: "ev", path: "out"}},
+    ],
+}
+
+describe("predicateToEntitySlices — group input", () => {
+    const onQuestion: RowPredicate = {
+        groupKind: "testset",
+        columnName: "question",
+        op: "eq",
+        value: "x",
+    }
+    const onSuccess: RowPredicate = {
+        groupKind: "evaluator",
+        columnName: "success",
+        op: "eq",
+        value: true,
+    }
+
+    /** Sorted slice names needed by a group joining both conditions with `op`. */
+    const slicesFor = (op: PredicateGroup["op"]) => {
+        const group: PredicateGroup = {op, conditions: [onQuestion, onSuccess]}
+        return [...predicateToEntitySlices(MIXED_SCHEMA, group).slices].sort()
+    }
+
+    it("takes the union of every condition's slices", () => {
+        // testset → results + testcases; evaluator → results + metrics.
+        assert.deepEqual(slicesFor("and"), ["metrics", "results", "testcases"])
+    })
+
+    it("the boolean operator does not change the slice set", () => {
+        // Same conditions, different operator: the slice sets must be identical.
+        assert.deepEqual(slicesFor("and"), slicesFor("or"))
+    })
+
+    it("an empty group needs no slices", () => {
+        const empty: PredicateGroup = {op: "and", conditions: []}
+        const needed = predicateToEntitySlices(MIXED_SCHEMA, empty)
+        assert.equal(needed.slices.size, 0)
+    })
+})
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/filterSchema.ts b/web/packages/agenta-entities/src/evaluationRun/etl/filterSchema.ts
new file mode 100644
index 0000000000..273063033e
--- /dev/null
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/filterSchema.ts
@@ -0,0 +1,133 @@
+/**
+ * filterSchema — derive the set of *filterable fields* for an evaluation
+ * run from its schema (steps + mappings).
+ *
+ * The filter UI (Phase 2 / T4) needs to know, before any scenario data is
+ * loaded: which columns can be filtered, what each one's value type is,
+ * and which operators that type allows. This module produces exactly
+ * that, keyed the same way `RowPredicate` targets a column
+ * (groupKind + groupSlug + columnName), so a UI selection maps straight
+ * onto a predicate.
+ *
+ * # Value typing
+ *
+ * The run schema does not carry per-column value types — those live in
+ * evaluator output schemas / sampled values, which this module does not
+ * fetch. So typing is best-effort:
+ *
+ *   - metrics columns        → "number" (cost / duration / tokens / scores)
+ *   - everything else        → "unknown"
+ *
+ * "unknown" still gets a safe equality-oriented operator set. Callers that
+ * *can* determine a precise type (e.g. T4 wiring with access to evaluator
+ * output schemas, or by sampling resolved values) pass `resolveValueType`
+ * to refine it — that is the intended extension seam, not an edit here.
+ *
+ * @packageDocumentation
+ */
+
+import {groupRunColumns, type ColumnGroup, type RunSchema} from "./resolveMappings"
+import type {RowPredicate} from "./rowPredicateFilter"
+
+/** Value type of a filterable field — selects its operator set (see `operatorsForType`). */
+export type FilterValueType = "string" | "number" | "boolean" | "unknown"
+
+/** Comparison operator of a filter clause — an alias of `RowPredicate["op"]`. */
+export type FilterOperator = RowPredicate["op"]
+
+/** A single field the user can filter on. */
+export interface FilterableField {
+    /** Targeting triple (kind, slug, name) — maps directly onto `RowPredicate`. */
+    groupKind: ColumnGroup["kind"]
+    groupSlug: string | null
+    columnName: string
+    /** Display label for the field (the column name). */
+    label: string
+    /** Display label for the owning group (nested-header style). */
+    groupLabel: string
+    /** Best-effort value type — "unknown" when undeterminable from the schema. */
+    valueType: FilterValueType
+    /** Operators valid for `valueType`, as returned by `operatorsForType`. */
+    operators: FilterOperator[]
+}
+
+export interface FilterSchema {
+    fields: FilterableField[]
+}
+
+const OPERATORS_BY_TYPE: Record<FilterValueType, FilterOperator[]> = {
+    number: ["eq", "ne", "lt", "lte", "gt", "gte", "in", "nin"],
+    string: ["eq", "ne", "in", "nin"],
+    boolean: ["eq", "ne"],
+    // Type not known: equality + membership are always safe, whereas ordered
+    // comparisons (lt / lte / gt / gte) are withheld until the type is known.
+    unknown: ["eq", "ne", "in", "nin"],
+}
+
+/** Operators offered for `type` — a fresh array, so callers may mutate it freely. */
+export function operatorsForType(type: FilterValueType): FilterOperator[] {
+    return OPERATORS_BY_TYPE[type].slice()
+}
+
+/** Fallback type from the schema alone: "number" for metrics groups, else "unknown". */
+function defaultValueType(kind: ColumnGroup["kind"]): FilterValueType {
+    return kind !== "metrics" ? "unknown" : "number"
+}
+
+export interface BuildFilterSchemaOptions {
+    /**
+     * Refine a field's value type; invoked once per deduplicated field.
+     * Return `undefined` to keep the schema-only default. This is the seam
+     * for type information that does not live in the run schema —
+     * evaluator output schemas, sampled resolved values, etc.
+     */
+    resolveValueType?: (field: {
+        groupKind: ColumnGroup["kind"]
+        groupSlug: string | null
+        columnName: string
+    }) => FilterValueType | undefined
+}
+
+/**
+ * Derive the filterable fields of a run from its schema (steps + mappings).
+ *
+ * Column groups come from `groupRunColumns`, so fields follow the order the
+ * table renders them (testset → application → evaluator → metrics →
+ * other). When a (groupKind, groupSlug, columnName) triple occurs more
+ * than once, only its first occurrence becomes a field.
+ *
+ * Each field's type is `options.resolveValueType`'s answer when it gives
+ * one, otherwise the schema-only default. A `null` schema has no fields.
+ */
+export function buildFilterSchema(
+    schema: RunSchema | null,
+    options: BuildFilterSchemaOptions = {},
+): FilterSchema {
+    if (!schema) return {fields: []}
+
+    // Insertion-ordered map keyed by the targeting triple: first occurrence wins.
+    const byTriple = new Map<string, FilterableField>()
+
+    for (const {group, columns} of groupRunColumns(schema.steps, schema.mappings)) {
+        for (const {kind: groupKind, groupSlug, name: columnName} of columns) {
+            const tripleKey = `${groupKind}::${groupSlug ?? ""}::${columnName}`
+            if (byTriple.has(tripleKey)) continue
+
+            // A caller-supplied hint takes precedence over the schema-only default.
+            const hinted = options.resolveValueType?.({groupKind, groupSlug, columnName})
+            const valueType = hinted ?? defaultValueType(groupKind)
+
+            byTriple.set(tripleKey, {
+                groupKind,
+                groupSlug,
+                columnName,
+                label: columnName,
+                groupLabel: group.label,
+                valueType,
+                operators: operatorsForType(valueType),
+            })
+        }
+    }
+
+    return {fields: Array.from(byTriple.values())}
+}
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts
index 505e3c5cb0..4bb71e0faf 100644
--- a/web/packages/agenta-entities/src/evaluationRun/etl/index.ts
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/index.ts
@@ -41,6 +41,8 @@ export type {
     ResolveMappingsOptions,
     ColumnGroup,
     ResolvedColumnGroup,
+    RunColumnLeaf,
+    RunColumnGroup,
 } from "./resolveMappings"
 export {
     DEFAULT_STEP_RESOLVERS,
@@ -53,6 +55,7 @@ export {
     resolveMappings,
     computeColumnGroup,
     groupResolvedColumns,
+    groupRunColumns,
 } from "./resolveMappings"
 
 // Molecule-backed cache-aware fetchers — all 4 entity types go through
@@ -91,13 +94,38 @@ export {
 // Per eval-filtering.md §D2: this is the v1 frontend transform over already-
 // loaded metric data. v2 server-side filter swaps the source's `filtering`
 // param and this transform becomes a no-op.
+//
+// Multi-predicate AND/OR composition (decision D8) — `PredicateGroup` plus
+// the `evaluate*` / `matchesRowFilter` row-level entry points and the
+// `makePredicateGroupFilter` pipeline transform.
 export {
     makeRowPredicateFilter,
+    makePredicateGroupFilter,
     unwrapStatsForCompare,
+    isPredicateGroup,
+    evaluateRowPredicate,
+    evaluatePredicateGroup,
+    evaluateRowFilter,
+    matchesRowFilter,
     type RowPredicate,
+    type PredicateGroup,
+    type RowFilter,
     type PredicateFilterOptions,
+    type PredicateGroupFilterOptions,
 } from "./rowPredicateFilter"
 
+// filterSchema — derives the filterable fields (typed + type-matched
+// operators) the Phase 2 filter UI offers. Decision D8 / eval-filtering D4.
+export {
+    buildFilterSchema,
+    operatorsForType,
+    type FilterSchema,
+    type FilterableField,
+    type FilterValueType,
+    type FilterOperator,
+    type BuildFilterSchemaOptions,
+} from "./filterSchema"
+
 // Hit-ratio meter — v1→v2 escalation signal (reports the regime; doesn't
 // swap engines today). Per eval-filtering.md §D2 + §C3: tracks rolling
 // (matched/scanned) and recommends escalating to v2 when the ratio falls
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/predicateToEntitySlices.ts b/web/packages/agenta-entities/src/evaluationRun/etl/predicateToEntitySlices.ts
index 51904c7c99..4a8bca4c6a 100644
--- a/web/packages/agenta-entities/src/evaluationRun/etl/predicateToEntitySlices.ts
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/predicateToEntitySlices.ts
@@ -26,7 +26,8 @@
 
 import type {ColumnGroup, RunMapping, RunSchema, RunStep} from "./resolveMappings"
 import {computeColumnGroup} from "./resolveMappings"
-import type {RowPredicate} from "./rowPredicateFilter"
+import type {PredicateGroup, RowPredicate} from "./rowPredicateFilter"
+import {isPredicateGroup} from "./rowPredicateFilter"
 
 export type EntitySlice = "results" | "metrics" | "testcases" | "traces"
 
@@ -132,18 +133,28 @@ function sliceForPredicate(schema: RunSchema, predicate: RowPredicate): EntitySl
 /**
  * Resolve the full set of slices needed across all active predicates.
  *
+ * Accepts a single predicate, a predicate array, or a `PredicateGroup`
+ * (flat AND/OR — decision D8). For a group the slice set is the **union**
+ * of every condition's slices: evaluating either an AND or an OR needs the
+ * data behind every condition, so the boolean operator does not change the
+ * fetch set.
+ *
  * Empty predicate set = no filter active = no predicate-driven fetch
  * required. Caller decides what to do (fetch all for display, or wait
  * for cells to materialize themselves).
  */
 export function predicateToEntitySlices(
     schema: RunSchema | null,
-    predicates: RowPredicate | RowPredicate[] | null | undefined,
+    predicates: RowPredicate | RowPredicate[] | PredicateGroup | null | undefined,
 ): PredicateSliceResult {
     if (!schema || !predicates) {
         return {slices: new Set(), matchedColumns: [], fallbackToAll: false}
     }
-    const list = Array.isArray(predicates) ? predicates : [predicates]
+    const list: RowPredicate[] = Array.isArray(predicates)
+        ? predicates
+        : isPredicateGroup(predicates)
+          ? predicates.conditions
+          : [predicates]
     if (list.length === 0) {
         return {slices: new Set(), matchedColumns: [], fallbackToAll: false}
     }
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts b/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts
index c34ac1fb57..f2d9be9f86 100644
--- a/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/resolveMappings.ts
@@ -637,3 +637,84 @@ export function groupResolvedColumns(columns: ResolvedColumn[]): ResolvedColumnG
         return (firstAppearance.get(a.group.key) ?? 0) - (firstAppearance.get(b.group.key) ?? 0)
     })
 }
+
+// ============================================================================
+// Pre-resolution column grouping — group raw mappings by source.
+//
+// `groupResolvedColumns` above groups columns AFTER a row's values are
+// resolved. `groupRunColumns` works directly off the run schema
+// (steps + mappings), so the UI can build column headers before any
+// scenario data is hydrated.
+// ============================================================================
+
+/** One UI column, described from the schema alone (no row values resolved yet). */
+export interface RunColumnLeaf {
+    /** Column display name (from `mapping.column.name`). */
+    name: string
+    /** Source category, copied from the owning group's `kind`. */
+    kind: ColumnGroup["kind"]
+    /** The owning group's slug (null for metrics and some "other" groups). */
+    groupSlug: string | null
+}
+
+/** All columns that share one `ColumnGroup` — rendered under one nested header. */
+export interface RunColumnGroup {
+    group: ColumnGroup
+    columns: RunColumnLeaf[]
+}
+
+/**
+ * Build the nested-header column groups for a run straight from its raw
+ * mappings — testset / application / evaluator(s) / metrics / other.
+ *
+ * Each mapping is assigned to the group `computeColumnGroup` returns for
+ * its step and path. Mappings whose step key is missing or unknown are
+ * grouped with a `null` step rather than dropped, so "other"-kind columns
+ * stay visible (the backend-metadata column path surfaces them too).
+ *
+ * Skipped mappings: those without a non-empty string column name, and
+ * internal dedup keys (any name containing `_dedup_id`, e.g.
+ * `testcase_dedup_id`), which are not user-facing columns.
+ *
+ * Ordering: groups sort by kind (testset → application → evaluator →
+ * metrics → other); groups of the same kind keep the order in which
+ * their first column appears in `mappings` (as `groupResolvedColumns` does).
+ */
+export function groupRunColumns(steps: RunStep[], mappings: RunMapping[]): RunColumnGroup[] {
+    const stepsByKey = new Map(steps.map((step): [string, RunStep] => [step.key, step]))
+    // Every slot remembers the mapping index that created it, for stable ordering.
+    const slots = new Map<string, {entry: RunColumnGroup; firstIdx: number}>()
+
+    mappings.forEach((mapping, idx) => {
+        const name = mapping.column?.name
+        if (typeof name !== "string" || name === "") return
+        // Internal dedup keys are not user-facing columns.
+        if (name.includes("_dedup_id")) return
+
+        const stepKey = mapping.step?.key
+        const step = stepKey ? (stepsByKey.get(stepKey) ?? null) : null
+        const group = computeColumnGroup(step, mapping.step?.path ?? "")
+
+        // The first column of a group creates its slot.
+        let slot = slots.get(group.key)
+        if (!slot) {
+            slot = {entry: {group, columns: []}, firstIdx: idx}
+            slots.set(group.key, slot)
+        }
+        slot.entry.columns.push({name, kind: group.kind, groupSlug: group.slug})
+    })
+
+    const rankOfKind: Record<ColumnGroup["kind"], number> = {
+        testset: 0,
+        application: 1,
+        evaluator: 2,
+        metrics: 3,
+        other: 4,
+    }
+    const ordered = Array.from(slots.values()).sort((a, b) => {
+        const byKind = rankOfKind[a.entry.group.kind] - rankOfKind[b.entry.group.kind]
+        // Same kind: fall back to the order the groups first appeared in `mappings`.
+        return byKind !== 0 ? byKind : a.firstIdx - b.firstIdx
+    })
+    return ordered.map((slot) => slot.entry)
+}
diff --git a/web/packages/agenta-entities/src/evaluationRun/etl/rowPredicateFilter.ts b/web/packages/agenta-entities/src/evaluationRun/etl/rowPredicateFilter.ts
index 8e8e712705..e36ddebed1 100644
--- a/web/packages/agenta-entities/src/evaluationRun/etl/rowPredicateFilter.ts
+++ b/web/packages/agenta-entities/src/evaluationRun/etl/rowPredicateFilter.ts
@@ -42,7 +42,12 @@
 import type {Chunk, Transform} from "../../etl/core/types"
 
 import type {HydratedScenarioRow, HydratableScenario} from "./hydrateScenariosTransform"
-import {resolveMappings, type ColumnGroup, type RunSchema} from "./resolveMappings"
+import {
+    resolveMappings,
+    type ColumnGroup,
+    type ResolvedColumn,
+    type RunSchema,
+} from "./resolveMappings"
 
 /**
  * One value-comparison clause against a single resolved column.
@@ -189,3 +194,131 @@ export function makeRowPredicateFilter<TScenario extends HydratableScenario>(
         return {...chunk, items: passing}
     }
 }
+
+// ============================================================================
+// Multi-predicate AND/OR composition (Phase 2 / T4 — decision D8)
+//
+// `RowPredicate` above is a single clause. A `PredicateGroup` joins several
+// clauses with ONE boolean operator. v1 is intentionally FLAT — `conditions`
+// are leaf predicates, not nested groups. That covers the real cases
+// ("score > 0.8 AND exact_match == true", "country in ['US','CA']") without
+// an arbitrary-tree filter UI. Nested groups can come later if needed.
+// ============================================================================
+
+/**
+ * A flat group of predicates joined by a single boolean operator.
+ *
+ * One nesting level only: `conditions` are leaf `RowPredicate`s.
+ *
+ * An **empty** group (`conditions: []`) is treated as *no constraint* —
+ * every row passes, regardless of `op`. An empty filter shows all rows.
+ */
+export interface PredicateGroup {
+    op: "and" | "or"
+    conditions: RowPredicate[]
+}
+
+/** A filter is either a single predicate or a flat AND/OR group. */
+export type RowFilter = RowPredicate | PredicateGroup
+
+/** Narrow a `RowFilter` to a `PredicateGroup` — true when it has a `conditions` array. */
+export function isPredicateGroup(filter: RowFilter): filter is PredicateGroup {
+    return Array.isArray((filter as PredicateGroup).conditions)
+}
+
+/** First resolved column matching the predicate's kind, name and — when given — slug. */
+function findTargetColumn(
+    cols: ResolvedColumn[],
+    predicate: RowPredicate,
+): ResolvedColumn | undefined {
+    for (const col of cols) {
+        if (col.group.kind !== predicate.groupKind || col.name !== predicate.columnName) continue
+        if (predicate.groupSlug == null || col.group.slug === predicate.groupSlug) return col
+    }
+    return undefined
+}
+
+/**
+ * Test a single predicate against columns that were already resolved for
+ * one row.
+ *
+ * When no column matches the predicate's target, or the matched column
+ * carries no value, the comparison runs against `undefined` — the same
+ * semantics as `makeRowPredicateFilter` (so `eq`/`lt`/… fail, `ne` passes).
+ *
+ * The function is *pure* and unaware of hydration state: deciding whether
+ * a not-yet-hydrated row should stay visible belongs to the caller.
+ */
+export function evaluateRowPredicate(predicate: RowPredicate, cols: ResolvedColumn[]): boolean {
+    const {op, value: expected} = predicate
+    const actual = unwrapStatsForCompare(findTargetColumn(cols, predicate)?.value)
+    return compare(actual, op, expected)
+}
+
+/**
+ * Combine a flat group's conditions with its single boolean operator.
+ *
+ * - no conditions   — nothing to satisfy, so the row passes (either `op`).
+ * - `op: "and"`     — passes only if all conditions match.
+ * - `op: "or"`      — passes as soon as any condition matches.
+ */
+export function evaluatePredicateGroup(group: PredicateGroup, cols: ResolvedColumn[]): boolean {
+    const {op, conditions} = group
+    if (conditions.length === 0) return true
+    const holds = (condition: RowPredicate): boolean => evaluateRowPredicate(condition, cols)
+    return op === "and" ? conditions.every(holds) : conditions.some(holds)
+}
+
+/** Evaluate a `RowFilter` against resolved columns, whichever shape it has. */
+export function evaluateRowFilter(filter: RowFilter, cols: ResolvedColumn[]): boolean {
+    // Groups carry a `conditions` array; anything else is a single leaf predicate.
+    if (isPredicateGroup(filter)) return evaluatePredicateGroup(filter, cols)
+    return evaluateRowPredicate(filter, cols)
+}
+
+/**
+ * Row-level entry point: resolve `row`'s columns from `schema`, then apply
+ * `filter` to them. The scenario table's `filteredRows` derivation uses it.
+ */
+export function matchesRowFilter<TScenario extends HydratableScenario>(
+    filter: RowFilter,
+    schema: RunSchema,
+    row: HydratedScenarioRow<TScenario>,
+): boolean {
+    const resolved = resolveMappings(row, schema)
+    return evaluateRowFilter(filter, resolved)
+}
+
+export interface PredicateGroupFilterOptions {
+    /** The active filter — a single predicate or a flat AND/OR group. */
+    filter: RowFilter
+    /** Run schema (steps + mappings), used to resolve columns per row. */
+    schema: RunSchema
+    /** Optional telemetry hook, called after each chunk; `chunk` is a 1-based count. */
+    onChunkFiltered?: (info: {chunk: number; scanned: number; matched: number}) => void
+}
+
+/**
+ * Create a pipeline `Transform` that drops every row failing a `RowFilter`
+ * (single predicate or AND/OR group) — the chunked / headless counterpart
+ * of the row-level `matchesRowFilter`.
+ *
+ * `onChunkFiltered`, when set, is told each chunk's scanned / matched counts.
+ */
+export function makePredicateGroupFilter<TScenario extends HydratableScenario>(
+    options: PredicateGroupFilterOptions,
+): Transform<HydratedScenarioRow<TScenario>, HydratedScenarioRow<TScenario>> {
+    const {filter, schema} = options
+    let chunksSeen = 0
+
+    return async (chunk: Chunk<HydratedScenarioRow<TScenario>>) => {
+        chunksSeen += 1
+        const kept: HydratedScenarioRow<TScenario>[] = []
+        for (const row of chunk.items) {
+            if (evaluateRowFilter(filter, resolveMappings(row, schema))) kept.push(row)
+        }
+        const report = {chunk: chunksSeen, scanned: chunk.items.length, matched: kept.length}
+        options.onChunkFiltered?.(report)
+        return {...chunk, items: kept}
+    }
+}
diff --git a/web/packages/agenta-entities/src/workflow/core/__tests__/schemaType.test.ts b/web/packages/agenta-entities/src/workflow/core/__tests__/schemaType.test.ts
new file mode 100644
index 0000000000..426a3cb856
--- /dev/null
+++ b/web/packages/agenta-entities/src/workflow/core/__tests__/schemaType.test.ts
@@ -0,0 +1,42 @@
+/**
+ * resolveSchemaType — regression guard for nullable evaluator output
+ * types.
+ *
+ * An evaluator output property declared nullable (`type: ["boolean",
+ * "null"]` or `anyOf: [{type: "boolean"}, {type: "null"}]`) must still
+ * resolve to its primitive type — otherwise the scenario filter bar
+ * mistypes a boolean field and offers numeric operators.
+ */
+
+import assert from "node:assert/strict"
+import {describe, it} from "node:test"
+
+import {resolveSchemaType} from "../schemaType"
+
+describe("resolveSchemaType", () => {
+    it("returns a plain string type", () => {
+        const plain = ["boolean", "number"]
+        assert.deepEqual(plain.map((type) => resolveSchemaType({type})), plain)
+    })
+
+    it("treats a bare 'null' type as no type", () => {
+        assert.strictEqual(resolveSchemaType({type: "null"}), undefined)
+    })
+
+    it("unwraps a nullable array type — first non-null entry", () => {
+        const fromTypes = (type: string[]) => resolveSchemaType({type})
+        assert.equal(fromTypes(["boolean", "null"]), "boolean")
+        assert.equal(fromTypes(["null", "number"]), "number")
+    })
+
+    it("unwraps a nullable anyOf / oneOf union", () => {
+        assert.equal(resolveSchemaType({anyOf: [{type: "boolean"}, {type: "null"}]}), "boolean")
+        assert.equal(resolveSchemaType({oneOf: [{type: "null"}, {type: "string"}]}), "string")
+    })
+
+    it("returns undefined when no type is resolvable", () => {
+        for (const unresolvable of [{}, null, undefined, {anyOf: [{type: "null"}]}]) {
+            assert.equal(resolveSchemaType(unresolvable), undefined)
+        }
+    })
+})
diff --git a/web/packages/agenta-entities/src/workflow/core/evaluatorResolution.ts b/web/packages/agenta-entities/src/workflow/core/evaluatorResolution.ts
index 99474b7c7c..76c44cb3dd 100644
--- a/web/packages/agenta-entities/src/workflow/core/evaluatorResolution.ts
+++ b/web/packages/agenta-entities/src/workflow/core/evaluatorResolution.ts
@@ -14,6 +14,7 @@
 import {resolveSchemaRef} from "../../runnable/portHelpers"
 
 import {resolveOutputSchema, resolveOutputSchemaProperties} from "./schema"
+import {resolveSchemaType} from "./schemaType"
 
 // ============================================================================
 // TYPES
@@ -135,7 +136,7 @@ export const extractMetrics = (evaluator: {
             kind: "metric" as const,
             path: key,
             stepKey: evaluator.slug || evaluator.id || "metric",
-            metricType: typeof schema?.type === "string" ? schema.type : METRIC_TYPE_FALLBACK,
+            metricType: resolveSchemaType(schema) ?? METRIC_TYPE_FALLBACK,
             displayLabel: typeof schema?.title === "string" ? schema.title : titleize(key),
             description: typeof schema?.description === "string" ? schema.description : undefined,
         }
diff --git a/web/packages/agenta-entities/src/workflow/core/schemaType.ts b/web/packages/agenta-entities/src/workflow/core/schemaType.ts
new file mode 100644
index 0000000000..b50e37204d
--- /dev/null
+++ b/web/packages/agenta-entities/src/workflow/core/schemaType.ts
@@ -0,0 +1,39 @@
+/**
+ * resolveSchemaType — resolve a JSON-schema node's primitive type.
+ *
+ * Tolerates the nullable encodings an evaluator output schema may use:
+ *   - `type: "boolean"`                                    — plain
+ *   - `type: ["boolean", "null"]`                          — array / nullable
+ *   - `anyOf | oneOf: [{type: "boolean"}, {type: "null"}]` — union / nullable
+ *
+ * Returns the first non-`"null"` type found, or `undefined` when none is
+ * resolvable.
+ *
+ * Kept in its own dependency-free module so it is unit-testable without
+ * pulling in the evaluator-resolution import graph.
+ *
+ * @packageDocumentation
+ */
+
+export const resolveSchemaType = (
+    schema: Record<string, unknown> | null | undefined,
+): string | undefined => {
+    if (!schema || typeof schema !== "object") return undefined
+    // 1. An explicit `type`: a string (bare "null" means no type) or an array of candidates.
+    const declared = schema.type
+    if (typeof declared === "string") return declared !== "null" ? declared : undefined
+    if (Array.isArray(declared)) {
+        for (const entry of declared) {
+            if (typeof entry === "string" && entry !== "null") return entry
+        }
+    }
+    // 2. Otherwise the first `anyOf`, then `oneOf`, branch that resolves to a type.
+    for (const branches of [schema.anyOf, schema.oneOf]) {
+        if (!Array.isArray(branches)) continue
+        for (const branch of branches) {
+            const resolved = resolveSchemaType(branch as Record<string, unknown>)
+            if (resolved) return resolved
+        }
+    }
+    return undefined
+}