From 39d89a7af41037d9acbf408cec12bc85969e2281 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 2 May 2026 09:35:50 -0400 Subject: [PATCH 1/5] Site tier-1: open-set banner, sensitivity selector, bootstrap intervals Adds the credibility-tightening tier-1 leaderboard changes from docs/site_improvements_scope.md, plus a shared sticky header that the paper page now reuses. Header - Extract SiteHeader from Hero. The new component owns the sticky brand + nav + view-selector + action-link layout and supports an alwaysExpanded mode for pages without an in-page hero. - Hero refactored to wrap SiteHeader and pass the country-aware subtitle, stat strip, and snapshot pill as expandedContent. Drop the "Top score" stat and the "Leading: " sidebar; the leaderboard itself is the canonical source for both. - /paper uses SiteHeader with alwaysExpanded, no view selector, and a Benchmark action link. The page body keeps its eyebrow/buttons/iframe. Open-set banner + snapshot pill - Above the leaderboard, a warning-tinted note states that scenarios and reference outputs are public, so the public preview is open-set. - Snapshot date pill (Snapshot 2026-05-01) appears in the hero stat row on the home page and next to the Manuscript eyebrow on /paper. Sensitivity-view selector - New segmented control with five views: Main, Amount only, Binary only, Positive cases, Zero cases. Selecting a view rescores models client-side from scenarioPredictions and reorders the leaderboard; the description for the active view appears next to the selector. - New utilities under app/src/lib/: - scoring.ts ports score_single_prediction (mean of exact, within-1%, within-5%, within-10% for amount; classification accuracy for binary; output-group resolution for person-expanded variables). Verified against the canonical analysis.py on the snapshot for both US and UK headline scopes. 
- sensitivity.ts builds the per-row score table from a DashboardBundle and aggregates output-group means -> country -> global, preserving the country-equal weighting. Sensitivity views filter rows before aggregation. Bootstrap rank intervals - bootstrap.ts implements the household-resampling bootstrap with a deterministic mulberry32 RNG (seed 42, 400 draws) and reports the 95% score interval and the rank range for each model under the active sensitivity view. - ModelLeaderboard renders Rank N(-M) - 95% L-U next to each model's point estimate, with a tooltip naming the bootstrap parameters. Repo - Move the python wheel-artifact lib/ rule in .gitignore to /lib/ and /lib64/ (top-level only) so app/src/lib/ is tracked. Verification - bun run lint - clean - bun run build - clean (Next.js 16 production build) - bun run start - SSR render of / contains the open-set banner, the snapshot pill, the five sensitivity selector chips, and per-model Rank/95% interval rows for all 12 models. /paper renders SiteHeader with the snapshot pill and Benchmark action link, no view selector. 
--- .gitignore | 4 +- app/src/App.tsx | 6 +- app/src/app/paper/page.tsx | 84 +++---- app/src/components/Hero.tsx | 301 +++++------------------- app/src/components/ModelLeaderboard.tsx | 128 +++++++++- app/src/components/SiteHeader.tsx | 260 ++++++++++++++++++++ app/src/lib/bootstrap.ts | 201 ++++++++++++++++ app/src/lib/scoring.ts | 100 ++++++++ app/src/lib/sensitivity.ts | 211 +++++++++++++++++ 9 files changed, 992 insertions(+), 303 deletions(-) create mode 100644 app/src/components/SiteHeader.tsx create mode 100644 app/src/lib/bootstrap.ts create mode 100644 app/src/lib/scoring.ts create mode 100644 app/src/lib/sensitivity.ts diff --git a/.gitignore b/.gitignore index 5a93dfc..bb3b31f 100644 --- a/.gitignore +++ b/.gitignore @@ -33,8 +33,8 @@ dist/ downloads/ eggs/ .eggs/ -lib/ -lib64/ +/lib/ +/lib64/ parts/ sdist/ var/ diff --git a/app/src/App.tsx b/app/src/App.tsx index 3957843..b006413 100644 --- a/app/src/App.tsx +++ b/app/src/App.tsx @@ -124,7 +124,11 @@ export default function App() {
- +
{!isGlobal && ( diff --git a/app/src/app/paper/page.tsx b/app/src/app/paper/page.tsx index 4d45f49..9f20a38 100644 --- a/app/src/app/paper/page.tsx +++ b/app/src/app/paper/page.tsx @@ -1,6 +1,10 @@ /* eslint-disable @next/next/no-img-element */ import Link from "next/link"; +import SiteHeader from "../../components/SiteHeader"; + +const SNAPSHOT_DATE_LABEL = "Snapshot 2026-05-01"; + const manuscriptPaths = { pdf: "/paper/policybench.pdf", web: "/paper/web/index.html?v=20260501", @@ -8,63 +12,39 @@ const manuscriptPaths = { const ssrnUrl = process.env.NEXT_PUBLIC_POLICYBENCH_SSRN_URL; export default function PaperPage() { + const expanded = ( + <> +

+ Benchmarking no-tool tax-and-benefit estimation in frontier language + models. This page embeds the frozen 2026-05-01 manuscript snapshot: + a 100-household-per-country public preview scored against + PolicyEngine reference outputs. +

+
+ + + {SNAPSHOT_DATE_LABEL} + +
+ + ); + return (
- +
Manuscript
-

- PolicyBench -

-

- Benchmarking no-tool tax-and-benefit estimation in frontier language - models. This page embeds the frozen 2026-05-01 manuscript snapshot: - a 100-household-per-country public preview scored against - PolicyEngine reference outputs. -

-
+
PolicyEngineResearch paper by PolicyEngine
-
+
{ssrnUrl && ( void; - views: ViewKey[]; - compact?: boolean; -}) { - const pill = compact - ? "rounded-full text-[10px] px-2.5 py-1 font-medium transition-colors" - : "rounded-full px-3 py-1.5 text-xs font-medium transition-colors sm:px-4"; - return ( -
- {views.map((view) => ( - - ))} -
- ); -} - -type NavItem = { id: string; label: string }; - -/** Returns 0 at top, 1 when fully collapsed. Smooth continuous value. */ -function getScrollProgress(threshold: number) { - if (typeof window === "undefined") return 0; - return Math.min(1, Math.max(0, window.scrollY / threshold)); -} - -function useScrollProgress(threshold = 80) { - const [progress, setProgress] = useState(() => getScrollProgress(threshold)); - const rafRef = useRef(0); - - useEffect(() => { - const onScroll = () => { - cancelAnimationFrame(rafRef.current); - rafRef.current = requestAnimationFrame(() => { - setProgress(getScrollProgress(threshold)); - }); - }; - window.addEventListener("scroll", onScroll, { passive: true }); - return () => { - window.removeEventListener("scroll", onScroll); - cancelAnimationFrame(rafRef.current); - }; - }, [threshold]); - - return progress; -} +const SNAPSHOT_DATE_LABEL = "Snapshot 2026-05-01"; export default function Hero({ selectedView, @@ -87,22 +22,21 @@ export default function Hero({ dashboard: DashboardBundle; data: BenchData | GlobalBenchData; availableViews: ViewKey[]; - navItems: readonly NavItem[]; + navItems: readonly HeaderNavItem[]; activeNav: string; }) { - const progress = useScrollProgress(80); - const scrolled = progress > 0.5; - const isGlobal = selectedView === "global"; const benchData = isGlobal ? null : (data as BenchData); const rankedNoTools = [...data.modelStats] .filter((m) => m.condition === "no_tools") .sort((a, b) => b.score - a.score); - const leadModel = rankedNoTools[0]; const countryHouseholds = Object.values(dashboard.countries).map( - (country) => Object.keys(country?.scenarios ?? {}).length + (country) => Object.keys(country?.scenarios ?? 
{}).length, + ); + const totalHouseholds = countryHouseholds.reduce( + (sum, count) => sum + count, + 0, ); - const totalHouseholds = countryHouseholds.reduce((sum, count) => sum + count, 0); const countryCount = countryHouseholds.length; const subtitle = isGlobal @@ -111,183 +45,70 @@ export default function Hero({ const stats = isGlobal ? [ - { value: `${leadModel?.score.toFixed(1) ?? "0.0"}%`, label: "Top score" }, { value: `${countryCount}`, label: "Countries" }, - { value: `${(data as GlobalBenchData).sharedModelCount}`, label: "Models" }, - { value: `${totalHouseholds.toLocaleString()}`, label: "Households" }, + { + value: `${(data as GlobalBenchData).sharedModelCount}`, + label: "Models", + }, + { + value: `${totalHouseholds.toLocaleString()}`, + label: "Households", + }, ] : [ - { value: `${leadModel?.score.toFixed(1) ?? "0.0"}%`, label: "Top score" }, { value: `${rankedNoTools.length}`, label: "Models" }, - { value: `${Object.keys(benchData!.scenarios).length.toLocaleString()}`, label: "Households" }, + { + value: `${Object.keys(benchData!.scenarios).length.toLocaleString()}`, + label: "Households", + }, { value: `${benchData!.programStats.length}`, label: "Outputs" }, ]; - // Continuous interpolation helpers - const lerp = (a: number, b: number) => a + (b - a) * progress; - const expandedPadTop = lerp(40, 8); // pt-10 → py-2 - const expandedPadBot = lerp(16, 8); - const titleSize = lerp(36, 16); // text-4xl → text-base - const expandOpacity = 1 - Math.min(1, progress * 2); // fade out faster - const expandHeight = `${(1 - progress) * 140}px`; - const navOpacity = Math.max(0, (progress - 0.3) / 0.7); // fade in after 30% - const bgOpacity = progress; - - return ( -
- {/* Background — fades in */} -
- - {/* Gradient glow — fades out */} -
- -
- {/* Top row: brand + nav + view selector */} -
- - - PolicyBench - - - - {/* Nav tabs — fade in as you scroll */} -
0.05 ? "600px" : "0px", - marginLeft: navOpacity > 0.05 ? "4px" : "0px", - }} - > -
-
- {navItems.map((item) => ( - - {item.label} - - ))} + const expanded = ( + <> +

+ {subtitle}{" "} + + 100% = exact answers across the full benchmark. + +

+ +
+
+ {stats.map((stat, i) => ( +
+ + {stat.value} + + + {stat.label} + + {i < stats.length - 1 && ( + + / + + )}
-
- -
- {/* Expanded content: subtitle + stats */} -
0.05 ? `${lerp(32, 0)}px` : "0px", - }} - > -

- {subtitle}{" "} - - 100% = exact answers across the full benchmark. - -

- -
-
- {stats.map((stat, i) => ( -
- - {stat.value} - - - {stat.label} - - {i < stats.length - 1 && ( - - / - - )} -
- ))} -
- - {leadModel && ( -
- Leading: - - {MODEL_LABELS[leadModel.model] ?? leadModel.model} - -
- )} -
-
+ + + {SNAPSHOT_DATE_LABEL} +
+ + ); - {/* Bottom border gradient — fades out */} -
-
+ return ( + ); } diff --git a/app/src/components/ModelLeaderboard.tsx b/app/src/components/ModelLeaderboard.tsx index 6537f84..a680bfc 100644 --- a/app/src/components/ModelLeaderboard.tsx +++ b/app/src/components/ModelLeaderboard.tsx @@ -1,6 +1,7 @@ -import { useMemo } from "react"; +import { useMemo, useState } from "react"; import type { BenchData, + DashboardBundle, GlobalBenchData, ModelStat, ViewKey, @@ -12,6 +13,13 @@ import { getProviderForModel, } from "../modelMeta"; import ProviderMark from "./ProviderMark"; +import { + SENSITIVITY_VIEWS, + buildAllRows, + modelScoresForView, + type SensitivityViewId, +} from "../lib/sensitivity"; +import { bootstrapIntervals, viewToFilter } from "../lib/bootstrap"; function Badge({ children, @@ -97,18 +105,50 @@ const PENDING_MODELS: Record = { export default function ModelLeaderboard({ data, selectedView, + dashboard, }: { data: BenchData | GlobalBenchData; selectedView: ViewKey; + dashboard: DashboardBundle; }) { const isGlobal = selectedView === "global"; - const noTools = useMemo( - () => - data.modelStats - .filter((m) => m.condition === "no_tools") - .sort((a, b) => b.score - a.score), - [data] - ); + const [sensitivityView, setSensitivityView] = + useState("main"); + + const allRows = useMemo(() => buildAllRows(dashboard), [dashboard]); + + const sensitivityScores = useMemo(() => { + return modelScoresForView(allRows, sensitivityView, selectedView); + }, [allRows, sensitivityView, selectedView]); + + const sensitivityScoreByModel = useMemo(() => { + const out = new Map(); + for (const entry of sensitivityScores) out.set(entry.model, entry.score); + return out; + }, [sensitivityScores]); + + const noTools = useMemo(() => { + const base = data.modelStats.filter((m) => m.condition === "no_tools"); + if (sensitivityView === "main") { + return [...base].sort((a, b) => b.score - a.score); + } + // Reorder + replace score with the sensitivity-view score, dropping models + // that don't have a score under this slice. 
+ return base + .filter((m) => sensitivityScoreByModel.has(m.model)) + .map((m) => ({ ...m, score: sensitivityScoreByModel.get(m.model)! })) + .sort((a, b) => b.score - a.score); + }, [data, sensitivityView, sensitivityScoreByModel]); + + const intervals = useMemo(() => { + return bootstrapIntervals( + allRows, + selectedView, + viewToFilter(sensitivityView), + { draws: 400, seed: 42 }, + ); + }, [allRows, selectedView, sensitivityView]); + const pendingModels = useMemo(() => { const present = new Set(noTools.map((model) => model.model)); const configured = PENDING_MODELS[selectedView].filter( @@ -121,6 +161,8 @@ export default function ModelLeaderboard({ }); }, [noTools, selectedView]); + const activeView = SENSITIVITY_VIEWS.find((v) => v.id === sensitivityView)!; + return (
Leaderboard
@@ -145,6 +187,53 @@ export default function ModelLeaderboard({ )}

+
+ +

+ Open-set leaderboard. The + public scenario explorer exposes prompts and PolicyEngine reference + outputs, so future model releases or fine-tunes could learn from the + released cases. Treat this as a public preview; protected + held-out claims would require a separate rotating evaluation set. +

+
+ +
+ + View + +
+ {SENSITIVITY_VIEWS.map((view) => ( + + ))} +
+ + {activeView.description} + +
+
)} + {rankRange && ( +
+ {rankRange} · 95% {scoreRange} +
+ )}
{m.score.toFixed(1)}% @@ -235,6 +339,14 @@ export default function ModelLeaderboard({ {m.score.toFixed(1)}% + {rankRange && ( +
+ {rankRange} · 95% {scoreRange} +
+ )} {!isGlobal && stabilityLabel && (
{stabilityLabel} diff --git a/app/src/components/SiteHeader.tsx b/app/src/components/SiteHeader.tsx new file mode 100644 index 0000000..b29013f --- /dev/null +++ b/app/src/components/SiteHeader.tsx @@ -0,0 +1,260 @@ +/* eslint-disable @next/next/no-img-element */ +"use client"; + +import Link from "next/link"; +import { useEffect, useRef, useState } from "react"; + +import type { ViewKey } from "../types"; +import { VIEW_LABELS } from "../types"; + +export type HeaderNavItem = { id: string; label: string }; + +export type HeaderActionLink = { + label: string; + href: string; + type?: "internal" | "external"; +}; + +function ViewSelector({ + selectedView, + onSelect, + views, + compact, +}: { + selectedView: ViewKey; + onSelect: (view: ViewKey) => void; + views: ViewKey[]; + compact?: boolean; +}) { + const pill = compact + ? "rounded-full text-[10px] px-2.5 py-1 font-medium transition-colors" + : "rounded-full px-3 py-1.5 text-xs font-medium transition-colors sm:px-4"; + return ( +
+ {views.map((view) => ( + + ))} +
+ ); +} + +function getScrollProgress(threshold: number) { + if (typeof window === "undefined") return 0; + return Math.min(1, Math.max(0, window.scrollY / threshold)); +} + +function useScrollProgress(threshold = 80) { + const [progress, setProgress] = useState(() => getScrollProgress(threshold)); + const rafRef = useRef(0); + + useEffect(() => { + const onScroll = () => { + cancelAnimationFrame(rafRef.current); + rafRef.current = requestAnimationFrame(() => { + setProgress(getScrollProgress(threshold)); + }); + }; + window.addEventListener("scroll", onScroll, { passive: true }); + return () => { + window.removeEventListener("scroll", onScroll); + cancelAnimationFrame(rafRef.current); + }; + }, [threshold]); + + return progress; +} + +export type SiteHeaderProps = { + navItems?: readonly HeaderNavItem[]; + activeNav?: string; + selectedView?: ViewKey; + onSelectView?: (view: ViewKey) => void; + availableViews?: ViewKey[]; + actionLink?: HeaderActionLink; + expandedContent?: React.ReactNode; + /** + * When true, the header always renders in its expanded state. Used on pages + * (e.g. /paper) where we don't have an in-page hero to drive the collapse. + */ + alwaysExpanded?: boolean; +}; + +export default function SiteHeader({ + navItems = [], + activeNav, + selectedView, + onSelectView, + availableViews, + actionLink, + expandedContent, + alwaysExpanded = false, +}: SiteHeaderProps) { + const measuredProgress = useScrollProgress(80); + const progress = alwaysExpanded ? 
0 : measuredProgress; + const scrolled = progress > 0.5; + + const lerp = (a: number, b: number) => a + (b - a) * progress; + const expandedPadTop = lerp(40, 8); + const expandedPadBot = lerp(16, 8); + const titleSize = lerp(36, 16); + const expandOpacity = 1 - Math.min(1, progress * 2); + const expandHeight = `${(1 - progress) * 320}px`; + const navOpacity = Math.max(0, (progress - 0.3) / 0.7); + const bgOpacity = progress; + + const showViewSelector = + availableViews && availableViews.length > 0 && selectedView && onSelectView; + + return ( +
+
+ +
+ +
+
+ + + PolicyBench + + + + {navItems.length > 0 && ( +
0.05 ? "600px" : "0px", + marginLeft: navOpacity > 0.05 ? "4px" : "0px", + }} + > +
+
+ {navItems.map((item) => ( + + {item.label} + + ))} +
+
+ )} + +
+ + {showViewSelector && ( + + )} + + {actionLink && ( +
0.05 ? "120px" : "0px", + }} + > + {actionLink.type === "external" ? ( + + {actionLink.label} + + ) : ( + + {actionLink.label} + + )} +
+ )} + + + by + PolicyEngine + +
+ + {expandedContent && ( +
0.05 + ? `${alwaysExpanded ? 32 : lerp(32, 0)}px` + : "0px", + }} + > + {expandedContent} +
+ )} +
+ +
+
+ ); +} diff --git a/app/src/lib/bootstrap.ts b/app/src/lib/bootstrap.ts new file mode 100644 index 0000000..3861c7e --- /dev/null +++ b/app/src/lib/bootstrap.ts @@ -0,0 +1,201 @@ +import type { CountryCode, ViewKey } from "../types"; +import type { ScoreRow } from "./scoring"; +import { type SensitivityViewId } from "./sensitivity"; + +const DEFAULT_DRAWS = 500; + +function mulberry32(seed: number): () => number { + let state = seed >>> 0; + return () => { + state = (state + 0x6d2b79f5) >>> 0; + let t = state; + t = Math.imul(t ^ (t >>> 15), t | 1); + t ^= t + Math.imul(t ^ (t >>> 7), t | 61); + return ((t ^ (t >>> 14)) >>> 0) / 4294967296; + }; +} + +type ModelScenarioOutputBuckets = Map< + string, // model + Map< + CountryCode, + Map< + string, // scenarioId + Map // outputGroup -> sum/count + > + > +>; + +function bucketize(rows: ScoreRow[]): ModelScenarioOutputBuckets { + const buckets: ModelScenarioOutputBuckets = new Map(); + for (const row of rows) { + let countryMap = buckets.get(row.model); + if (!countryMap) { + countryMap = new Map(); + buckets.set(row.model, countryMap); + } + let scenarioMap = countryMap.get(row.country); + if (!scenarioMap) { + scenarioMap = new Map(); + countryMap.set(row.country, scenarioMap); + } + let outputMap = scenarioMap.get(row.scenarioId); + if (!outputMap) { + outputMap = new Map(); + scenarioMap.set(row.scenarioId, outputMap); + } + const cur = outputMap.get(row.outputGroup) ?? { sum: 0, count: 0 }; + cur.sum += row.score * 100; + cur.count += 1; + outputMap.set(row.outputGroup, cur); + } + return buckets; +} + +export type BootstrapInterval = { + lower: number; + upper: number; + rankLower: number; + rankUpper: number; +}; + +export function bootstrapIntervals( + rows: ScoreRow[], + selectedView: ViewKey, + filterFn: (row: ScoreRow) => boolean, + options: { draws?: number; seed?: number } = {}, +): Map { + const draws = options.draws ?? DEFAULT_DRAWS; + const seed = options.seed ?? 
42; + const filtered = rows.filter(filterFn); + const buckets = bucketize(filtered); + + // Per-country scenario universe. + const perCountryScenarios = new Map(); + for (const countryMap of buckets.values()) { + for (const [country, scenarioMap] of countryMap) { + const list = perCountryScenarios.get(country) ?? []; + for (const scenarioId of scenarioMap.keys()) { + if (!list.includes(scenarioId)) list.push(scenarioId); + } + perCountryScenarios.set(country, list); + } + } + for (const list of perCountryScenarios.values()) list.sort(); + + const countriesToUse: CountryCode[] = + selectedView === "global" + ? (["us", "uk"] as CountryCode[]).filter((c) => + perCountryScenarios.has(c), + ) + : [selectedView as CountryCode]; + + const models = [...buckets.keys()]; + const rng = mulberry32(seed); + + const drawScores: Record = {}; + for (const model of models) drawScores[model] = []; + const rankSamples: Record = {}; + for (const model of models) rankSamples[model] = []; + + for (let draw = 0; draw < draws; draw += 1) { + // Sample scenario ids per country with replacement. + const sampledIds = new Map(); + for (const country of countriesToUse) { + const ids = perCountryScenarios.get(country); + if (!ids || ids.length === 0) continue; + const sampled: string[] = []; + for (let i = 0; i < ids.length; i += 1) { + sampled.push(ids[Math.floor(rng() * ids.length)]); + } + sampledIds.set(country, sampled); + } + + const scoreThisDraw: Record = {}; + for (const model of models) { + const countryMap = buckets.get(model)!; + const countryScores: number[] = []; + for (const country of countriesToUse) { + const scenarioMap = countryMap.get(country); + if (!scenarioMap) continue; + const sampled = sampledIds.get(country) ?? []; + // Aggregate output-group means across the sampled scenarios. 
+ const outputBuckets = new Map(); + for (const scenarioId of sampled) { + const outputMap = scenarioMap.get(scenarioId); + if (!outputMap) continue; + for (const [outputGroup, v] of outputMap) { + const cur = outputBuckets.get(outputGroup) ?? { + sum: 0, + count: 0, + }; + // Each scenario contributes its mean for that output_group. + cur.sum += v.sum / v.count; + cur.count += 1; + outputBuckets.set(outputGroup, cur); + } + } + if (outputBuckets.size === 0) continue; + let totalGroupMean = 0; + let groupCount = 0; + for (const v of outputBuckets.values()) { + if (v.count === 0) continue; + totalGroupMean += v.sum / v.count; + groupCount += 1; + } + if (groupCount > 0) countryScores.push(totalGroupMean / groupCount); + } + if (countryScores.length === countriesToUse.length) { + scoreThisDraw[model] = + countryScores.reduce((a, b) => a + b, 0) / countryScores.length; + } + } + + const ranked = Object.entries(scoreThisDraw).sort( + (a, b) => b[1] - a[1], + ); + for (let i = 0; i < ranked.length; i += 1) { + const [model, score] = ranked[i]; + drawScores[model].push(score); + rankSamples[model].push(i + 1); + } + } + + const out = new Map(); + for (const model of models) { + const scores = drawScores[model].sort((a, b) => a - b); + const ranks = rankSamples[model]; + if (scores.length === 0) continue; + const lowerIndex = Math.floor(scores.length * 0.025); + const upperIndex = Math.min( + scores.length - 1, + Math.ceil(scores.length * 0.975) - 1, + ); + out.set(model, { + lower: scores[lowerIndex], + upper: scores[upperIndex], + rankLower: Math.min(...ranks), + rankUpper: Math.max(...ranks), + }); + } + return out; +} + +export function viewToFilter( + view: SensitivityViewId, +): (row: ScoreRow) => boolean { + switch (view) { + case "main": + return () => true; + case "amount_only": + return (row) => row.metricType === "amount"; + case "binary_only": + return (row) => row.metricType === "binary"; + case "positive_only": + return (row) => row.truth !== 0; + case 
"zero_only": + return (row) => row.truth === 0; + default: + return () => true; + } +} diff --git a/app/src/lib/scoring.ts b/app/src/lib/scoring.ts new file mode 100644 index 0000000..6cce63d --- /dev/null +++ b/app/src/lib/scoring.ts @@ -0,0 +1,100 @@ +import { isBinaryVariable, type CountryCode } from "../types"; + +export type ScoreRow = { + country: CountryCode; + scenarioId: string; + variable: string; + outputGroup: string; + model: string; + truth: number; + prediction: number | null | undefined; + metricType: "amount" | "binary"; + score: number; +}; + +const PERSON_OUTPUT_PREFIXES = [ + "head", + "spouse", + "adult1", + "adult2", + "adult3", + "adult4", + "adult5", + "child1", + "child2", + "child3", + "child4", + "child5", + "dependent1", + "dependent2", + "dependent3", +] as const; + +const PERSON_OUTPUT_SUFFIXES = [ + "wic", + "medicaid", + "chip", + "medicare", + "head_start", + "early_head_start", +] as const; + +export function outputGroupForVariable(variable: string): string { + const match = variable.match( + /^(head|spouse|adult\d+|child\d+|dependent\d+)_(wic|medicaid|chip|medicare|head_start|early_head_start)_eligible$/, + ); + if (match) { + return `person_${match[2]}_eligible`; + } + // Already grouped or not a person-expanded variable. + return variable; +} + +export function metricTypeForVariable( + variable: string, + country: CountryCode, +): "amount" | "binary" { + if (isBinaryVariable(variable, country)) return "binary"; + const match = variable.match( + /^(head|spouse|adult\d+|child\d+|dependent\d+)_(wic|medicaid|chip|medicare|head_start|early_head_start)_eligible$/, + ); + if (match) return "binary"; + return "amount"; +} + +function within(truth: number, prediction: number, tolerance: number): number { + if (truth === 0) { + return Math.abs(prediction) <= 1.0 ? 1 : 0; + } + return Math.abs(prediction - truth) / Math.abs(truth) <= tolerance ? 
1 : 0; +} + +function exactAmount(truth: number, prediction: number): number { + return Math.abs(prediction - truth) <= 1.0 ? 1 : 0; +} + +export function scorePrediction( + variable: string, + country: CountryCode, + truth: number, + prediction: number | null | undefined, +): number { + if (prediction === null || prediction === undefined || Number.isNaN(prediction)) { + return 0; + } + const metricType = metricTypeForVariable(variable, country); + if (metricType === "binary") { + return Math.round(prediction) === Math.round(truth) ? 1 : 0; + } + const exact = exactAmount(truth, prediction); + const w1 = within(truth, prediction, 0.01); + const w5 = within(truth, prediction, 0.05); + const w10 = within(truth, prediction, 0.1); + return (exact + w1 + w5 + w10) / 4; +} + +// Touch the prefix/suffix tables so a future test can verify coverage. +export const PERSON_OUTPUT_PREFIX_LIST: readonly string[] = + PERSON_OUTPUT_PREFIXES; +export const PERSON_OUTPUT_SUFFIX_LIST: readonly string[] = + PERSON_OUTPUT_SUFFIXES; diff --git a/app/src/lib/sensitivity.ts b/app/src/lib/sensitivity.ts new file mode 100644 index 0000000..e358d6c --- /dev/null +++ b/app/src/lib/sensitivity.ts @@ -0,0 +1,211 @@ +import type { + BenchData, + CountryCode, + DashboardBundle, + ViewKey, +} from "../types"; +import { + metricTypeForVariable, + outputGroupForVariable, + scorePrediction, + type ScoreRow, +} from "./scoring"; + +export type SensitivityViewId = + | "main" + | "amount_only" + | "binary_only" + | "positive_only" + | "zero_only"; + +export type SensitivityView = { + id: SensitivityViewId; + label: string; + description: string; +}; + +export const SENSITIVITY_VIEWS: SensitivityView[] = [ + { + id: "main", + label: "Main", + description: "Equal-weight average across output groups; baseline ranking.", + }, + { + id: "amount_only", + label: "Amount only", + description: "Drops binary coverage flags; ranks on amount outputs only.", + }, + { + id: "binary_only", + label: "Binary only", + 
description: "Restricts to binary coverage outputs.", + }, + { + id: "positive_only", + label: "Positive cases", + description: "Restricts to rows where the reference value is non-zero.", + }, + { + id: "zero_only", + label: "Zero cases", + description: "Restricts to rows where the reference value is zero.", + }, +]; + +export type ScenarioRow = { + country: CountryCode; + scenarioId: string; + outputGroup: string; + model: string; + score: number; +}; + +function buildRows(country: CountryCode, payload: BenchData): ScoreRow[] { + const rows: ScoreRow[] = []; + for (const [scenarioId, variableMap] of Object.entries( + payload.scenarioPredictions, + )) { + for (const [variable, modelMap] of Object.entries(variableMap)) { + const outputGroup = outputGroupForVariable(variable); + const metricType = metricTypeForVariable(variable, country); + for (const [model, record] of Object.entries(modelMap)) { + rows.push({ + country, + scenarioId, + variable, + outputGroup, + model, + truth: record.groundTruth, + prediction: record.prediction, + metricType, + score: scorePrediction( + variable, + country, + record.groundTruth, + record.prediction, + ), + }); + } + } + } + return rows; +} + +export function buildAllRows(dashboard: DashboardBundle): ScoreRow[] { + const rows: ScoreRow[] = []; + for (const country of ["us", "uk"] as CountryCode[]) { + const payload = dashboard.countries[country]; + if (!payload) continue; + rows.push(...buildRows(country, payload)); + } + return rows; +} + +function filterRows(rows: ScoreRow[], view: SensitivityViewId): ScoreRow[] { + switch (view) { + case "main": + return rows; + case "amount_only": + return rows.filter((row) => row.metricType === "amount"); + case "binary_only": + return rows.filter((row) => row.metricType === "binary"); + case "positive_only": + return rows.filter((row) => row.truth !== 0); + case "zero_only": + return rows.filter((row) => row.truth === 0); + default: + return rows; + } +} + +function aggregateGroupMean( + 
rows: T[], + key: (row: T) => string, + value: (row: T) => number, +): Record { + const sums = new Map(); + for (const row of rows) { + const k = key(row); + const v = value(row); + if (!Number.isFinite(v)) continue; + const cur = sums.get(k) ?? { sum: 0, count: 0 }; + cur.sum += v; + cur.count += 1; + sums.set(k, cur); + } + const out: Record = {}; + for (const [k, { sum, count }] of sums) { + if (count > 0) out[k] = sum / count; + } + return out; +} + +export type ModelScore = { + model: string; + score: number; +}; + +function scoresPerCountryModel(rows: ScoreRow[]): Map< + string, + Map +> { + // First reduce to (country, model, output_group) means. + const groupKey = (row: ScoreRow) => + `${row.country}|${row.model}|${row.outputGroup}`; + const outputMeans = aggregateGroupMean(rows, groupKey, (row) => row.score * 100); + // Then average the output groups by (country, model). + const buckets = new Map(); + for (const [k, mean] of Object.entries(outputMeans)) { + const [country, model] = k.split("|"); + const bk = `${country}|${model}`; + const cur = buckets.get(bk) ?? { sum: 0, count: 0 }; + cur.sum += mean; + cur.count += 1; + buckets.set(bk, cur); + } + // Reshape into Map>. 
+ const out = new Map>(); + for (const [bk, { sum, count }] of buckets) { + if (count === 0) continue; + const [country, model] = bk.split("|"); + if (!out.has(country)) out.set(country, new Map()); + out.get(country)!.set(model, sum / count); + } + return out; +} + +export function modelScoresForView( + rows: ScoreRow[], + view: SensitivityViewId, + selectedView: ViewKey, +): ModelScore[] { + const filtered = filterRows(rows, view); + const perCountry = scoresPerCountryModel(filtered); + if (selectedView === "global") { + const allModels = new Set(); + for (const map of perCountry.values()) { + for (const m of map.keys()) allModels.add(m); + } + const out: ModelScore[] = []; + for (const model of allModels) { + const present: number[] = []; + for (const map of perCountry.values()) { + const s = map.get(model); + if (s !== undefined && Number.isFinite(s)) present.push(s); + } + // Global score requires presence in both countries. + if (present.length === perCountry.size && perCountry.size > 0) { + out.push({ + model, + score: present.reduce((a, b) => a + b, 0) / present.length, + }); + } + } + return out.sort((a, b) => b.score - a.score); + } + const map = perCountry.get(selectedView); + if (!map) return []; + return [...map.entries()] + .map(([model, score]) => ({ model, score })) + .sort((a, b) => b.score - a.score); +} From 3a23f3ba687cc61e4f9515c9fd50b330dcde6690 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 6 May 2026 08:43:51 -0400 Subject: [PATCH 2/5] Address review findings on the site tier-1 PR - bootstrap.ts now sums per-row sums and counts directly when aggregating output-group means inside each draw, so the bootstrap estimator matches the canonical headline scoring rule (each row contributes equally to the output-group mean instead of being collapsed to a per-scenario mean first). - modelScoresForView and bootstrapIntervals require every required country to have rows under the active sensitivity slice before returning a global ranking. 
ModelLeaderboard falls back to Main when a slice has no rows in one country (e.g. "Binary only" with no UK binary outputs) and surfaces a notice; sensitivity buttons that cannot apply globally are aria-disabled with a tooltip. - Sensitivity selector and country view selector now expose role, aria-label, and aria-pressed state. - SiteHeader collapsed nav items and action link are no longer keyboard-focusable while hidden (tabIndex=-1, aria-hidden). - useScrollProgress no longer subscribes to scroll when alwaysExpanded, and DEFAULT_DRAWS is exported and used as the single source for the bootstrap draw count (400). - .gitignore restores the Python lib/ blanket ignore and adds an explicit !app/src/lib/ + !app/src/lib/** allowlist so app/src/lib is tracked while nested lib/ directories elsewhere stay ignored. --- .gitignore | 8 +- app/src/components/ModelLeaderboard.tsx | 104 ++++++++++++++++++------ app/src/components/SiteHeader.tsx | 29 +++++-- app/src/lib/bootstrap.ts | 35 +++++--- app/src/lib/sensitivity.ts | 33 ++++++-- 5 files changed, 159 insertions(+), 50 deletions(-) diff --git a/.gitignore b/.gitignore index bb3b31f..89d0192 100644 --- a/.gitignore +++ b/.gitignore @@ -33,8 +33,12 @@ dist/ downloads/ eggs/ .eggs/ -/lib/ -/lib64/ +lib/ +lib64/ +# Tracked TypeScript helpers under app/src/lib/ — exempted from the +# Python-style lib/ blanket ignore above. 
+!app/src/lib/ +!app/src/lib/** parts/ sdist/ var/ diff --git a/app/src/components/ModelLeaderboard.tsx b/app/src/components/ModelLeaderboard.tsx index a680bfc..1edba39 100644 --- a/app/src/components/ModelLeaderboard.tsx +++ b/app/src/components/ModelLeaderboard.tsx @@ -17,9 +17,14 @@ import { SENSITIVITY_VIEWS, buildAllRows, modelScoresForView, + viewSupportsGlobal, type SensitivityViewId, } from "../lib/sensitivity"; -import { bootstrapIntervals, viewToFilter } from "../lib/bootstrap"; +import { + DEFAULT_DRAWS, + bootstrapIntervals, + viewToFilter, +} from "../lib/bootstrap"; function Badge({ children, @@ -117,9 +122,24 @@ export default function ModelLeaderboard({ const allRows = useMemo(() => buildAllRows(dashboard), [dashboard]); + // Some sensitivity slices have no rows in one country (e.g. "Binary only" + // has zero UK rows). In that case the global view cannot be a true + // cross-country score; fall back to the canonical Main view so the global + // tab still has a defensible ranking and surface a notice on the leaderboard. + const globalUnsupportedForView = useMemo( + () => + isGlobal && + sensitivityView !== "main" && + !viewSupportsGlobal(allRows, sensitivityView), + [allRows, isGlobal, sensitivityView], + ); + const effectiveView: SensitivityViewId = globalUnsupportedForView + ? 
"main" + : sensitivityView; + const sensitivityScores = useMemo(() => { - return modelScoresForView(allRows, sensitivityView, selectedView); - }, [allRows, sensitivityView, selectedView]); + return modelScoresForView(allRows, effectiveView, selectedView); + }, [allRows, effectiveView, selectedView]); const sensitivityScoreByModel = useMemo(() => { const out = new Map(); @@ -129,7 +149,7 @@ export default function ModelLeaderboard({ const noTools = useMemo(() => { const base = data.modelStats.filter((m) => m.condition === "no_tools"); - if (sensitivityView === "main") { + if (effectiveView === "main") { return [...base].sort((a, b) => b.score - a.score); } // Reorder + replace score with the sensitivity-view score, dropping models @@ -138,16 +158,16 @@ export default function ModelLeaderboard({ .filter((m) => sensitivityScoreByModel.has(m.model)) .map((m) => ({ ...m, score: sensitivityScoreByModel.get(m.model)! })) .sort((a, b) => b.score - a.score); - }, [data, sensitivityView, sensitivityScoreByModel]); + }, [data, effectiveView, sensitivityScoreByModel]); const intervals = useMemo(() => { return bootstrapIntervals( allRows, selectedView, - viewToFilter(sensitivityView), - { draws: 400, seed: 42 }, + viewToFilter(effectiveView), + { draws: DEFAULT_DRAWS, seed: 42 }, ); - }, [allRows, selectedView, sensitivityView]); + }, [allRows, selectedView, effectiveView]); const pendingModels = useMemo(() => { const present = new Set(noTools.map((model) => model.model)); @@ -209,30 +229,64 @@ export default function ModelLeaderboard({ className="mt-5 flex flex-wrap items-center gap-3 animate-fade-up" style={{ animationDelay: "200ms" }} > - + View -
- {SENSITIVITY_VIEWS.map((view) => ( - - ))} +
+ {SENSITIVITY_VIEWS.map((view) => { + const isActive = sensitivityView === view.id; + const disabledForGlobal = + isGlobal && + view.id !== "main" && + !viewSupportsGlobal(allRows, view.id); + return ( + + ); + })}
{activeView.description}
+ {globalUnsupportedForView && ( +

+ The {activeView.label.toLowerCase()} slice has no rows in at least + one country, so the global ranking falls back to the Main view. + Switch to United States or United Kingdom to see this slice on a + single country. +

+ )}
+
{views.map((view) => (