Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
07e77df
docs: add eval-scenarios-table-integration design (eng + design revie…
ardaerzin May 21, 2026
d899b31
docs: drop T1 from eval-scenarios-table plan — store already thin
ardaerzin May 21, 2026
0cbb07f
docs: record T2 export-coupling finding from Table.tsx read
ardaerzin May 22, 2026
aac4b36
feat(oss): swap eval scenarios table to ETL run-graph columns + cells
ardaerzin May 22, 2026
9724541
doc(eval): close T4 filter composition — multi-predicate AND/OR (D8)
ardaerzin May 22, 2026
165c008
feat(entities): T4 core — multi-predicate AND/OR filter logic + filte…
ardaerzin May 22, 2026
5d8cc7a
fix(oss): restore scenario table columns dropped by the ETL swap
ardaerzin May 22, 2026
33efa7e
feat(oss): multi-predicate filtering in the eval scenarios table (T4)
ardaerzin May 22, 2026
5249e9e
feat(oss): scope v1 scenario filtering to metric-related columns
ardaerzin May 22, 2026
4efc57a
fix(oss): type scenario filter columns from the evaluator output schema
ardaerzin May 22, 2026
67ac220
fix(entities): resolve nullable evaluator output types in extractMetrics
ardaerzin May 22, 2026
e9abd35
chore(oss): log evaluator schema on scenario filter column select
ardaerzin May 22, 2026
e1289ae
fix(oss): match evaluator metric definitions by the bare output key
ardaerzin May 22, 2026
2df8e49
fix(oss): stop scenario filter flicker while gathering data
ardaerzin May 22, 2026
3cee585
fix(oss): materialize filtered scenario rows incrementally, no flicker
ardaerzin May 22, 2026
a1e950c
fix(oss): move scenario filter into a popover (observability pattern)
ardaerzin May 22, 2026
c314630
fix(oss): inline AND/OR connector in scenario filter rows
ardaerzin May 22, 2026
e80759e
fix(oss): align first scenario filter row with the rest
ardaerzin May 22, 2026
20a5274
fix(oss): widen scenario filter connector slot to fit And/Or
ardaerzin May 22, 2026
534dab7
fix(oss): show filter "scanning" only while actually working
ardaerzin May 22, 2026
2a2cc97
fix(oss): move the scenario filter into the run header row
ardaerzin May 22, 2026
cb69779
fix(oss): reposition the scenario filter per the updated mockup
ardaerzin May 22, 2026
842bc51
chore: retire the ETL scenarios PoC (T7)
ardaerzin May 22, 2026
39cf81b
feat(oss): add in / not-in operators to the scenario filter
ardaerzin May 22, 2026
27db0ff
feat(oss): load compare runs eagerly when filtering scenarios (T5)
ardaerzin May 22, 2026
4d452c4
feat(oss): live updates for the ETL scenarios table (T6)
ardaerzin May 22, 2026
1db1db4
docs: record T8 co-consumer verification (D9)
ardaerzin May 22, 2026
8cb6bd1
feat(oss): live-refresh scenario steps and metrics while a run executes
ardaerzin May 22, 2026
54c80b5
Merge branch 'fe-experiment/etl-batch-add-traces' into fe-experiment/…
ardaerzin May 25, 2026
0d36c9a
Merge branch 'fe-experiment/etl-batch-add-traces' into fe-experiment/…
bekossy May 26, 2026
3f86613
fix(frontend): prevent removing the last filter condition in Scenario…
ardaerzin May 26, 2026
49a7324
Merge branch 'fe-experiment/etl-batch-add-traces' into fe-experiment/…
bekossy May 26, 2026
d707f84
Merge branch 'fe-experiment/etl-batch-add-traces' into fe-experiment/…
bekossy May 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
314 changes: 314 additions & 0 deletions docs/designs/eval-scenarios-table-integration.md

Large diffs are not rendered by default.

329 changes: 256 additions & 73 deletions web/oss/src/components/EvalRunDetails/Table.tsx

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions web/oss/src/components/EvalRunDetails/atoms/metrics.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {getProjectValues} from "@/oss/state/project"
import {previewEvalTypeAtom} from "../state/evalType"
import {resolveValueBySegments, splitPath} from "../utils/valueAccess"

import {isTerminalStatus} from "./compare"
import {
createMetricProcessor,
isLegacyValueLeaf,
Expand Down Expand Up @@ -782,13 +783,23 @@ export const evaluationMetricQueryAtomFamily = atomFamily(
const batcher = get(evaluationMetricBatcherFamily({runId: effectiveRunId}))
const projectId = resolveProjectId(get)

// While the run is still executing, poll so a completing
// scenario's metrics surface in the table cells + focus drawer
// without a manual reload. Stops once the run is terminal.
const runQuery = effectiveRunId
? get(evaluationRunQueryAtomFamily(effectiveRunId))
: undefined
const runStatus = runQuery?.data?.rawRun?.status ?? runQuery?.data?.camelRun?.status
const runTerminal = isTerminalStatus(runStatus)

return {
queryKey: ["preview", "evaluation-metric", effectiveRunId, projectId, scenarioId],
enabled: Boolean(projectId && effectiveRunId && batcher && scenarioId),
staleTime: 30_000,
gcTime: 5 * 60 * 1000,
refetchOnWindowFocus: false,
refetchOnReconnect: false,
refetchInterval: runTerminal ? false : 5000,
// Enable structural sharing to prevent unnecessary re-renders when data hasn't changed
structuralSharing: true,
queryFn: async () => {
Expand Down
12 changes: 12 additions & 0 deletions web/oss/src/components/EvalRunDetails/atoms/scenarioSteps.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ import type {IStepResponse} from "@/oss/lib/evaluations"
import {snakeToCamelCaseKeys} from "@/oss/lib/helpers/casing"
import {getProjectValues} from "@/oss/state/project"

import {isTerminalStatus} from "./compare"
import {activePreviewRunIdAtom, effectiveProjectIdAtom} from "./run"
import {evaluationRunQueryAtomFamily} from "./table/run"
import type {ScenarioStepsBatchResult} from "./types"

const scenarioStepsBatcherCache = new Map<string, BatchFetcher<string, ScenarioStepsBatchResult>>()
Expand Down Expand Up @@ -128,11 +130,21 @@ export const scenarioStepsQueryFamily = atomFamily(
const effectiveRunId = resolveEffectiveRunId(get, runId)
const batcher = get(scenarioStepsBatcherFamily({runId: effectiveRunId}))

// While the run is still executing, poll so the focus drawer /
// scenario viewer pick up a scenario's results as it completes.
// Stops once the run is terminal.
const runQuery = effectiveRunId
? get(evaluationRunQueryAtomFamily(effectiveRunId))
: undefined
const runStatus = runQuery?.data?.rawRun?.status ?? runQuery?.data?.camelRun?.status
const runTerminal = isTerminalStatus(runStatus)

return {
queryKey: ["preview", "scenario-steps", effectiveRunId, scenarioId],
enabled: Boolean(effectiveRunId && batcher && scenarioId),
refetchOnWindowFocus: false,
refetchOnReconnect: false,
refetchInterval: runTerminal ? false : 5000,
staleTime: 30_000,
gcTime: 5 * 60 * 1000,
// Enable structural sharing to prevent unnecessary re-renders when data hasn't changed
Expand Down
12 changes: 11 additions & 1 deletion web/oss/src/components/EvalRunDetails/atoms/table/columns.ts
Original file line number Diff line number Diff line change
Expand Up @@ -499,9 +499,19 @@ const tableColumnsBaseAtomFamily = atomFamily((runId: string | null) =>
}

const evaluator = column.evaluatorId ? evaluatorById.get(column.evaluatorId) : undefined
// Match the evaluator's metric definition by the canonical
// metric key (e.g. "attributes.ag.data.outputs.score") OR the
// bare value key (e.g. "score"). `extractMetrics` keys metrics
// by the output-schema property name — the bare key — so a
// canonical-key-only match misses and `metricType` falls back
// to "string", mis-typing the column (e.g. a boolean output).
const metricKey = column.metricKey || column.valueKey
const metricDefinition = evaluator?.metrics.find(
(metric) => metric.name === metricKey || metric.path === metricKey,
(metric) =>
metric.name === metricKey ||
metric.path === metricKey ||
metric.name === column.valueKey ||
metric.path === column.valueKey,
)
const metricType =
metricDefinition?.metricType || column.metricType || METRIC_TYPE_FALLBACK
Expand Down
2 changes: 1 addition & 1 deletion web/oss/src/components/EvalRunDetails/components/Page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ const EvalRunPreviewPage = ({runId, evaluationType, projectId = null}: EvalRunPr
headerClassName="px-4 pt-2"
>
<div className="flex h-full min-h-0 flex-col gap-2 [&_.ant-tabs-content]:h-full [&_.ant-tabs-tabpane]:h-full">
<PreviewEvalRunMeta runId={runId} projectId={projectId} />
<PreviewEvalRunMeta runId={runId} projectId={projectId} activeView={activeView} />
<Tabs
className="flex-1 min-h-0 overflow-hidden"
activeKey={activeView}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
runTestsetIdsAtomFamily,
runFlagsAtomFamily,
} from "../atoms/runDerived"
import ScenarioFilterBar from "../etl/ScenarioFilterBar"
import {previewEvalTypeAtom} from "../state/evalType"

import CompareRunsMenu from "./CompareRunsMenu"
Expand Down Expand Up @@ -137,10 +138,12 @@ const PreviewEvalRunMeta = ({
runId,
projectId,
className,
activeView,
}: {
runId: string
projectId?: string | null
className?: string
activeView?: ActiveView
}) => {
const _invocationRefs = useAtomValue(useMemo(() => runInvocationRefsAtomFamily(runId), [runId]))
const _testsetIds = useAtomValue(useMemo(() => runTestsetIdsAtomFamily(runId), [runId]))
Expand Down Expand Up @@ -220,6 +223,7 @@ const PreviewEvalRunMeta = ({
</Button>
</Tooltip>
) : null}
{activeView === "scenarios" ? <ScenarioFilterBar runId={runId} /> : null}
<CompareRunsMenu runId={runId} />
</div>
</div>
Expand Down
74 changes: 74 additions & 0 deletions web/oss/src/components/EvalRunDetails/etl/EtlColumnHeader.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/**
* EtlColumnHeader
*
* Renders the nested-header label for a column group. The default
* `computeColumnGroup` resolver falls back to `Testset <slug>` /
* `Application <slug>` because it doesn't fetch the entity itself.
*
* This header overrides that fallback, following the same pattern as
* production's `StepGroupHeader`: it subscribes to the entity reference
* atom by ID and surfaces the entity's name when available, falling back
* to the slug otherwise. Evaluator, metrics, and other groups already
* carry `slugToTitle`-rendered labels, so no entity lookup is needed.
*/

import {useMemo} from "react"

import type {ColumnGroup} from "@agenta/entities/evaluationRun/etl"
import {Tooltip} from "antd"
import {atom, useAtomValue} from "jotai"

import {
applicationReferenceQueryAtomFamily,
testsetReferenceQueryAtomFamily,
} from "../atoms/references"

// Placeholder atom that always holds `null`. `EtlColumnHeader` subscribes to it
// when a group has no testset/application id to look up, so the hook call stays
// unconditional.
const emptyAtom = atom<{data: {name?: string; slug?: string} | null} | null>(null)

/** Props accepted by `EtlColumnHeader`. */
interface EtlColumnHeaderProps {
/** Column group whose header label is rendered. */
group: ColumnGroup
}

/**
 * Extracts a usable display name from an arbitrary value.
 *
 * @param entity - Any value; only non-null objects are inspected.
 * @returns The value's `name` property when it is a non-empty string,
 * otherwise `null`.
 */
const pickName = (entity: unknown): string | null => {
    // Primitives, functions, `undefined` and `null` cannot carry a name.
    if (typeof entity !== "object" || entity === null) return null

    const candidate = (entity as {name?: unknown}).name
    if (typeof candidate !== "string" || candidate.length === 0) return null

    return candidate
}

/**
 * Header cell for a column group.
 *
 * For `testset` and `application` groups it subscribes to the matching
 * reference query atom (keyed by the id found in `group.refs`) and, when the
 * loaded entity exposes a non-empty `name`, renders `Testset <name>` /
 * `Application <name>`. In every other case it renders `group.label`.
 * The label is shown truncated with an ellipsis and repeated in a tooltip.
 *
 * @param group - Column group to render the header for.
 */
const EtlColumnHeader = ({group}: EtlColumnHeaderProps) => {
    // Always resolve to *some* atom so `useAtomValue` is called unconditionally.
    const refAtom = useMemo(() => {
        switch (group.kind) {
            case "testset": {
                const testsetId = (group.refs?.testset as {id?: string} | undefined)?.id
                return testsetId ? testsetReferenceQueryAtomFamily(testsetId) : emptyAtom
            }
            case "application": {
                const applicationId = (group.refs?.application as {id?: string} | undefined)?.id
                return applicationId
                    ? applicationReferenceQueryAtomFamily(applicationId)
                    : emptyAtom
            }
            default:
                return emptyAtom
        }
    }, [group])

    const reference = useAtomValue(refAtom) as {data?: unknown} | null
    const entityName = pickName(reference?.data ?? null)

    const label = useMemo(() => {
        // Without a resolved entity name every kind falls back to the group's own label.
        if (!entityName) return group.label
        if (group.kind === "testset") return `Testset ${entityName}`
        if (group.kind === "application") return `Application ${entityName}`
        return group.label
    }, [group.kind, group.label, entityName])

    return (
        <Tooltip title={label} placement="top">
            <span className="block max-w-full overflow-hidden text-ellipsis whitespace-nowrap text-left">
                {label}
            </span>
        </Tooltip>
    )
}

export default EtlColumnHeader
Loading
Loading